evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py

@@ -2,80 +2,90 @@ import importlib
 from collections import defaultdict
 from typing import Dict, List
 
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils import get_logger
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
 
 logger = get_logger()
 
 
-@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-'
-
+@register_benchmark(
+    BenchmarkMeta(
+        name='tau_bench',
+        pretty_name='τ-bench',
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+        description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+        'and a language agent provided with domain-specific API tools and policy guidelines. '
+        'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/tau_bench.html)',  # noqa: E501
+        dataset_id='https://github.com/sierra-research/tau-bench',
+        subset_list=['airline', 'retail'],
+        metric_list=['Pass^1'],
+        eval_split='test',
+        extra_params={
+            'user_model': 'qwen-plus',
+            'api_key': 'EMPTY',
+            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'generation_config': {
+                'temperature': 0.0,
+                'max_tokens': 4096,
+            }
         }
-
-
+    )
+)
+class TauBenchAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
-            )
-
-        metric_registry.register(Metric(name='Pass^1', object=mean))
+        check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)
 
         # setup user model args
-
-        self.
-        self.
-        self.
-        self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+        self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+        self.api_key = self.extra_params.get('api_key', 'EMPTY')
+        self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
 
         self._patch_env_completion()
 
+    @run_once
     def _patch_env_completion(self) -> str:
         from tau_bench.envs.user import LLMUserSimulationEnv
 
         def new_generate_next_message(self, messages):
-            from evalscope.
+            from evalscope.api.messages import dict_to_chat_message
+            from evalscope.api.model import GenerateConfig, get_model
+            from evalscope.constants import EvalType
+
+            user_server = get_model(
+                model=adapter_instance.user_model,
+                eval_type=EvalType.SERVICE,
+                base_url=adapter_instance.api_base,
+                api_key=adapter_instance.api_key,
+                config=GenerateConfig(**adapter_instance.generation_config)
+            )
 
-
-                api_url=adapter_instance.api_base,
-                model_id=adapter_instance.user_model,
-                api_key=adapter_instance.api_key)
-            request_json = user_server.make_request(
-                input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
-            res = user_server.send_request(request_json)
+            res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])
 
-            message =
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return
+            return res.completion
 
         # get the current instance of TauBenchAdapter
         adapter_instance = self
         LLMUserSimulationEnv.generate_next_message = new_generate_next_message
 
-    def load(self
+    def load(self):
         from tau_bench.envs import get_env
 
         data_dict = defaultdict(dict)

@@ -94,17 +104,61 @@ class TauBenchAdapter(DataAdapter):
                 'task_index': i,
                 'env_name': env_name,
             })
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return
+            # load dataset
+            dataset = DictDataLoader(
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
+            ).load()
+
+            data_dict[env_name] = dataset
+
+        test_dataset = DatasetDict(data_dict)
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content='')],
+            target='',  # Will use the record for evaluation
+            subset_key=record['env_name'],
+            metadata=record  # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample)
+
+    def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Parse the prediction to get the reward
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)
+
+            score.value = {
+                'Pass^1': float(reward),
+            }
+            score.explanation = f'Task completed with reward: {reward}'
+            score.metadata = {
+                'task_result': task_result,
+                'env_name': task_state.metadata.get('env_name', 'unknown'),
+                'task_index': task_state.metadata.get('task_index', -1)
+            }
+            score.main_score_name = 'Pass^1'
+
+        except Exception as e:
+            score.value = {'Pass^1': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'Pass^1'
+
+        return score
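The tau_bench hunk above captures the 1.0 adapter contract: a benchmark is registered with `register_benchmark(BenchmarkMeta(...))`, turns raw records into `Sample` objects in `record_to_sample`, and grades with `match_score`, which returns a `Score`. Below is a minimal sketch assembled only from the calls visible in this diff; the benchmark name, dataset id, and the `acc` metric are hypothetical placeholders, not part of the package.

```python
# Hypothetical minimal adapter, assembled from the patterns shown in this diff.
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_qa_bench',              # placeholder name
        dataset_id='my_org/my_dataset',  # placeholder dataset id
        tags=[Tags.REASONING],
        subset_list=['default'],
        metric_list=['acc'],             # assumed to be a registered metric name
        eval_split='test',
    )
)
class MyQABenchAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: dict) -> Sample:
        # Map one raw record onto the framework's Sample structure.
        return Sample(
            input=[ChatMessageUser(content=record['question'])],
            target=record['answer'],
            metadata=record,
        )

    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
        # Exact-match grading, mirroring the Score fields used by the adapters in this diff.
        score = Score(extracted_prediction=filtered_prediction, prediction=original_prediction)
        score.value = {'acc': float(filtered_prediction.strip() == reference.strip())}
        score.main_score_name = 'acc'
        return score
```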
evalscope/benchmarks/text2image/__init__.py
File without changes
evalscope/benchmarks/text2image/evalmuse_adapter.py (new file)

@@ -0,0 +1,78 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from collections import defaultdict
+from typing import List, Optional, Union
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.metric.scorer import AggScore, Score
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
+from evalscope.metrics import mean
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='evalmuse',
+        pretty_name='EvalMuse',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['EvalMuse'],
+        metric_list=['FGA_BLIP2Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
+)
+class EvalMuseAdapter(Text2ImageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        assert len(self.metric_list
+                   ) == 1 and self.metric_list[0] == 'FGA_BLIP2Score', 'Only FGA_BLIP2Score is supported for EvalMuse'
+
+    @thread_safe
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state):
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get('image_path', original_prediction)
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        try:
+            metric_name = self.metric_list[0]
+            metric_cls = get_metric(metric_name)
+            metric_func = metric_cls()  # Initialize with parameters
+            metric_score = metric_func(image_path, task_state.metadata)[0]
+
+            for k, v in metric_score.items():
+                score.value[f'{metric_name}:{k}'] = v.cpu().item()
+        except Exception as e:
+            logger.error(f'Error calculating metric {metric_name}: {e}')
+            score.value[metric_name] = 0
+            score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def aggregate_scores(self, sample_scores) -> List[AggScore]:
+        new_items = defaultdict(list)
+        agg_list = []
+        for sample_score in sample_scores:
+            for metric_name, value in sample_score.score.value.items():
+                metrics_prefix = metric_name.split(':')[0]
+                category = metric_name.rpartition('(')[-1].split(')')[0]
+                category = category.split('-')[0].lower()  # remove the suffix if exists
+                new_items[f'{metrics_prefix}:{category}'].append(value)
+
+        for k, v in new_items.items():
+            agg_list.append(AggScore(metric_name=k, score=mean(v), num=len(v)))
+
+        return agg_list
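`EvalMuseAdapter.aggregate_scores` above averages per-sample values after rewriting keys of the form `metric:skill(category-suffix)` into `metric:category`. A small standalone check of that key parsing; the example key is hypothetical, since the exact keys emitted by `FGA_BLIP2Score` are not shown in this diff:

```python
# Reproduces only the key-parsing lines from aggregate_scores above; the key is a made-up example.
metric_name = 'FGA_BLIP2Score:attribute(color-1)'

metrics_prefix = metric_name.split(':')[0]                 # 'FGA_BLIP2Score'
category = metric_name.rpartition('(')[-1].split(')')[0]   # 'color-1'
category = category.split('-')[0].lower()                  # 'color'

print(f'{metrics_prefix}:{category}')  # FGA_BLIP2Score:color
```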
evalscope/benchmarks/text2image/genai_bench_adapter.py (new file)

@@ -0,0 +1,53 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.registry import get_metric, register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='genai_bench',
+        pretty_name='GenAI-Bench',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['GenAI-Bench-1600'],
+        metric_list=['VQAScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
+)
+class GenAIBenchAdapter(Text2ImageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load_from_disk(self, **kwargs):
+        if os.path.isfile(self.dataset_id):
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        advanced = record['tags'].get('advanced')
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': 'advanced' if advanced else 'basic',
+                'tags': record.get('tags', []),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
evalscope/benchmarks/text2image/general_t2i_adapter.py (new file)

@@ -0,0 +1,42 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='general_t2i',
+        dataset_id='general_t2i',
+        description='General Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
+        subset_list=['default'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
+)
+class GeneralT2IAdapter(Text2ImageAdapter):
+
+    def __init__(self, **kwargs):
+
+        super().__init__(**kwargs)
+
+    def load_from_disk(self, **kwargs):
+        if os.path.isfile(self.dataset_id):
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(input=[ChatMessageUser(content=record['prompt'])], metadata={'image_path': record['image_path']})
evalscope/benchmarks/text2image/hpdv2_adapter.py (new file)

@@ -0,0 +1,52 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='hpdv2',
+        pretty_name='HPD-v2',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['HPDv2'],
+        metric_list=['HPSv2.1Score'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
+)
+class HPDv2Adapter(Text2ImageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load_from_disk(self, **kwargs):
+        if os.path.isfile(self.dataset_id):
+            file_name = os.path.basename(self.dataset_id)
+            file_without_ext = os.path.splitext(file_name)[0]
+            self.subset_list = [file_without_ext]
+
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record):
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
+                'category': record.get('tags', {}).get('category', ''),
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''),  # Optional field for existing image path
+            }
+        )
evalscope/benchmarks/text2image/tifa_adapter.py (new file)

@@ -0,0 +1,27 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tifa160',
+        pretty_name='TIFA-160',
+        dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+        description='TIFA-160 Text-to-Image Benchmark',
+        tags=[Tags.TEXT_TO_IMAGE],
+        subset_list=['TIFA-160'],
+        metric_list=['PickScore'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+    )
+)
+class TIFA_Adapter(Text2ImageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
evalscope/benchmarks/tool_bench/tool_bench_adapter.py

@@ -1,81 +1,102 @@
 import json
-from typing import
-
-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tool_bench',
+        pretty_name='ToolBench-Static',
+        tags=[Tags.REASONING, Tags.FUNCTION_CALLING],
+        description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+        'It includes various subsets such as in-domain and out-of-domain, '
+        'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',
+        dataset_id='AI-ModelScope/ToolBench-Static',
+        subset_list=['in_domain', 'out_of_domain'],
+        metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+        eval_split='test',
+    )
 )
-class ToolBenchAdapter(
+class ToolBenchAdapter(DefaultDataAdapter):
+    """
+    ToolBench adapter using the new data processing framework.
+    """
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-
-
-
-
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
-        """
-        messages = input_d['messages']
-        # use prepared messages and remove the name field
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        messages = record['messages']
+
+        # Process messages and remove the name field, convert function messages
+        processed_messages = []
         for message in messages:
-
-
-
-
-
-
-
-
-
-
-
-
-
-            return
-
-
-
-
-
-
-
-
-
-
-
+            msg_dict = message.copy()
+            if 'name' in msg_dict:
+                del msg_dict['name']
+            if 'role' in msg_dict:
+                if msg_dict['role'] == 'function':
+                    content = json.dumps(msg_dict, ensure_ascii=False)
+                    msg_dict['role'] = 'user'
+                    msg_dict['content'] = content
+
+            # Convert to ChatMessage object
+            chat_msg = dict_to_chat_message(msg_dict)
+            processed_messages.append(chat_msg)
+
+        return Sample(
+            input=processed_messages,
+            target='',  # Store the full record as target for evaluation
+            metadata={
+                'target': record['target'],
+                'tools': record['tools'],
+                'messages': record['messages']
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
         from .utils import calculate_metrics
 
-
-
-
-
-
-
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+
+        try:
+            data = {
+                'target': doc['target'],
+                'predictions': filtered_prediction,
+                'tools': doc['tools'],
+            }
+            metrics = calculate_metrics(data)
+
+            score.value = metrics
+            score.explanation = f'Metrics: {metrics}'
+            score.metadata = {'target': doc['target'], 'tools': doc['tools'], 'detailed_metrics': metrics}
+            # Set the main score (you can choose the most important metric)
+            score.main_score_name = 'F1'
 
-
-
-
+        except Exception as e:
+            # Handle evaluation errors
+            score.value = {'Act.EM': 0.0, 'Plan.EM': 0.0, 'F1': 0.0, 'HalluRate': 1.0, 'Rouge-L': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'F1'
 
-        return
+        return score