evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/bbh/bbh_adapter.py

@@ -1,17 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import json
 import os
-import random
 import re
+from typing import Any, Dict
 
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
-# flake8: noqa
-
 logger = get_logger()
 
 # BBH multiple choice subset list

@@ -55,160 +54,89 @@ FREE_FORM_LIST = [
 TASK_TYPE = 'task_type'
 SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
 
… 14 removed lines not shown …
+PROMPT_TEMPLATE = """
+Q: {question}
+A: Let's think step by step. Put your final answer in the format of "So the answer is $ANSWER" (without quotes and markdown) where $ANSWER is the answer to the problem.
+""".lstrip()  # noqa: E501
+
+FEWSHOT_TEMPLATE = """
+{fewshot}
+
+""".lstrip() + PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='bbh',
+        pretty_name='BBH',
+        dataset_id='evalscope/bbh',
+        tags=[Tags.REASONING],
+        description=
+        'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.',  # noqa: E501
+        subset_list=SUBSET_LIST,
+        few_shot_num=3,
+        train_split=None,
+        eval_split='test',
+        metric_list=['acc'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
 )
-class BBHAdapter(
+class BBHAdapter(DefaultDataAdapter):
     """
     Adapter for BBH free-form and multiple-choices sub-tasks.
     """
 
     def __init__(self, **kwargs):
-
         few_shot_num = kwargs.get('few_shot_num', 3)
 
         if few_shot_num != 3 and few_shot_num != 0:
-            logger.error(
-
+            logger.error(
+                f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
+                f'Use 3-shot by default.'
+            )
             kwargs['few_shot_num'] = 3
 
         super().__init__(**kwargs)
 
-    def
… 30 removed lines not shown …
-        {'data': ['xxx']}
-        """
-        # few_shot_list: should be ['xxxx']
-        if len(few_shot_list) > 0:
-            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
-        else:
-            cot_prompts = ''
-        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
-
-        return self.gen_prompt_data(full_prompt)
-
-    def gen_prompts(self, data_dict: dict) -> dict:
-        """
-        Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-        Args:
-            data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
-        Returns:
-            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-            prompt_d_i (dict): refer to the output of gen_prompt method.
-
-            e.g. train -- few-shot data, test -- target dataset to evaluate.
-        """
-        res_dict: dict = {}
-
-        if self.few_shot_num < 0:
-            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
-        logger.info(f'Use default settings: '
-                    f'> few_shot_num: {self.few_shot_num}, '
-                    f'> few_shot_split: {self.train_split}, '
-                    f'> target_eval_split: {self.eval_split}')
-
-        for sub_name, sub_data_dict in data_dict.items():
-            few_shot_data = []
-            if self.few_shot_num > 0:
-                with open(
-                        os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
-                        encoding='utf-8') as f:
-                    cot_prompt_str = f.read()
-                few_shot_data = [cot_prompt_str]
-
-            res_dict[sub_name] = []
-            for sample_d in sub_data_dict[self.eval_split]:
-                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=few_shot_data)
-                sample_d_new = sample_d.copy()
-                if sub_name in MULTIPLE_CHOICE_LIST:
-                    sample_d_new[TASK_TYPE] = MULTIPLE_CHOICE
-                elif sub_name in FREE_FORM_LIST:
-                    sample_d_new[TASK_TYPE] = FREE_FORM
-                else:
-                    raise ValueError(f'Invalid subset name: {sub_name}')
-
-                prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
-                res_dict[sub_name].append(prompt_d)
-
-        return res_dict
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        gold = input_d.get('target', '')
-        # remove brackets
-        if gold is None:
-            logger.error(f'BBHAdapter: gold is None.')
-        gold = gold.replace('(', '').replace(')', '')
-        return gold
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        # Note: to use same extraction method for both of checkpoint/service/custom.
-        task_type: str = raw_input_d.get(TASK_TYPE)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input = record['input']
+        target = record['target'].replace('(', '').replace(')', '').strip()  # Clean up the target answer
+
+        # Determine task type based on subset name
+        task_type = None
+        subset_name = self.current_subset_name
+        if subset_name in MULTIPLE_CHOICE_LIST:
+            task_type = MULTIPLE_CHOICE
+        elif subset_name in FREE_FORM_LIST:
+            task_type = FREE_FORM
+
+        metadata = {TASK_TYPE: task_type}
+
+        return Sample(input=input, target=target, metadata=metadata, subset_key=subset_name)
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        # Load CoT prompts from file for BBH
+        subset_name = sample.subset_key
+        if subset_name:
+            cot_file_path = os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{subset_name}.txt')
+            if os.path.exists(cot_file_path):
+                with open(cot_file_path, 'r', encoding='utf-8') as f:
+                    fewshot = f.read().strip()
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            question=sample.input,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        task_type = task_state.metadata.get(TASK_TYPE)
 
         if task_type == MULTIPLE_CHOICE:
-            return self._extract_mc_answer(
+            return self._extract_mc_answer(prediction)
         elif task_type == FREE_FORM:
-            return self._extract_ff_answer(
+            return self._extract_ff_answer(prediction)
         else:
… 2 removed lines not shown …
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
+            return prediction.strip()
 
     @classmethod
     def _extract_mc_answer(cls, ans: str) -> str:
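The BBH hunk above shows the adapter contract that replaces the old `gen_prompt`/`parse_pred_result`/`match` interface in 1.0: a benchmark is declared with `@register_benchmark(BenchmarkMeta(...))`, subclasses `DefaultDataAdapter`, maps raw records to `Sample` objects in `record_to_sample`, and post-processes model output in `extract_answer`. A minimal sketch of that pattern follows; the decorator, base class, and hook names are taken from the hunks in this diff, while the benchmark name, dataset id, and record keys are hypothetical, and it assumes the optional `Sample` and `BenchmarkMeta` fields shown elsewhere (tags, few-shot settings, metadata) can be omitted.

```python
# Hedged sketch of the 1.0 adapter contract, mirroring the BBH/BFCL hunks above.
# 'my_bench', the dataset id, and the record keys are hypothetical; the imports,
# decorator, base class, and method names are the ones visible in this diff.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_bench',                   # hypothetical benchmark name
        pretty_name='MyBench',
        dataset_id='my-org/my_bench',      # hypothetical dataset id
        description='Toy benchmark used to illustrate the new adapter API.',
        subset_list=['default'],
        metric_list=['acc'],
        eval_split='test',
        prompt_template='Q: {question}\nA:',
    )
)
class MyBenchAdapter(DefaultDataAdapter):
    """Not part of the package; illustrates the hooks the rewritten adapters override."""

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the framework's Sample type.
        return Sample(input=record['question'], target=record['answer'])

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        # Post-process raw model output before it is matched against the target.
        return prediction.strip()
```

Prompt construction, few-shot formatting, and answer matching are inherited from `DefaultDataAdapter` unless overridden, which is consistent with the rewritten adapters in the file list shrinking (for example bbh_adapter.py at +72 -144) while gaining the new hooks.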
evalscope/benchmarks/bfcl/bfcl_adapter.py

@@ -1,12 +1,17 @@
-import copy
-import importlib
 import json
 import re
 import traceback
-from typing import Any,
-
-from evalscope.
-from evalscope.
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()

@@ -32,47 +37,43 @@ SUBJECT_MAPPING = {
 }
 
 
-@
… 21 removed lines not shown …
+@register_benchmark(
+    BenchmarkMeta(
+        name='bfcl_v3',
+        pretty_name='BFCL-v3',
+        tags=[Tags.FUNCTION_CALLING],
+        description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
+        'and executable function call evaluation** '
+        'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
+        'functions. Unlike previous evaluations, '
+        'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+        'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',
+        dataset_id='AI-ModelScope/bfcl_v3',
+        subset_list=list(SUBJECT_MAPPING.keys()),
+        metric_list=['acc'],
+        eval_split='train',
+        extra_params={
+            'underscore_to_dot': True,
+            'is_fc_model': True,
+        }
+    )
+)
+class BFCLAdapter(DefaultDataAdapter):
+    """
+    BFCL adapter using the new data processing framework.
+    """
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)
 
         self.category_map = SUBJECT_MAPPING
+        self.reformat_subset = True
 
-
-        self.
-        self.is_fc_model = extra_params.get('is_fc_model', True)
-
-    def load(self, **kwargs):
-        kwargs['subset_list'] = ['default']
-        data_dict = super().load(**kwargs)
-        return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+        self.is_fc_model = self.extra_params.get('is_fc_model', True)
 
     def preprocess_row(self, row: dict):
        """

@@ -87,151 +88,167 @@ class BFCLAdapter(DataAdapter):
         row['initial_config'] = json.loads(row['initial_config'])
         row['is_fc_model'] = self.is_fc_model
 
-    def
-
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        self.preprocess_row(record)
 
         # If the model is a function calling model, we need to remove the system prompt
         if self.is_fc_model:
-            turns =
+            turns = record['turns']
             new_turns = []
             for turn_idx, messages in enumerate(turns):
                 current_messages = messages.copy()
                 if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
                     current_messages = current_messages[1:]
                 new_turns.append(current_messages)
… 2 removed lines not shown …
-            return
… 13 removed lines not shown …
+            record['turns'] = new_turns
+
+        return Sample(
+            input=[ChatMessageUser(content='')],
+            target='',  # Will use the record for evaluation
+            subset_key=record['subset'],
+            metadata=record  # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample)
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
         from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
-        from bfcl_eval.model_handler.utils import (
-
+        from bfcl_eval.model_handler.utils import (
+            convert_to_function_call,
+            default_decode_ast_prompting,
+            default_decode_execute_prompting,
+        )
         from bfcl_eval.utils import is_empty_output
 
… 27 removed lines not shown …
-        #
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+            if self.underscore_to_dot:
+                dummy_model = 'gpt-4o-2024-11-20-FC'
+            else:
+                dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+            row = task_state.metadata
+            test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+
+            if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+                error = None
+                try:
+                    if self.is_fc_model:
+                        decoded_tool_calls = []
+                        for tool_call in row['generation'][0]:
+                            name = list(tool_call.keys())[0]
+                            params = tool_call[name]
+                            decoded_tool_calls.append({name: params})
+                    else:
+                        decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                    # successful decode means valid function call was present
+                    contains_func_call = True
+                    if is_empty_output(decoded_tool_calls):
+                        # Empty output is not considered as a valid function call
+                        contains_func_call = False
+                        error = 'Empty decoded output.'
+                except Exception:
                     contains_func_call = False
-                    error = '
… 39 removed lines not shown …
-                    'error_type': 'multi_turn:checker_failed',
-                    'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
-                }
+                    error = f'Failed to decode with traceback: {traceback.format_exc()}'
+                finally:
+                    valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                    score_result = {'valid': valid, 'error_message': error}
+
+            elif row['multi_turn']:
+                # each step might give a list of tool calls and each turn is multi-step
+                # and multi-turn has generations of all the turns
+                # hence in a multi-turn setting,
+                # multi_turn_decoded_generations is a list of list of list of strings
+                multi_turn_decoded_generations: list[list[list[str]]] = []
+                for single_turn_generations in row['generation']:
+                    single_turn_decoded_generations: list[list[str]] = []
+                    for generation in single_turn_generations:
+                        try:
+                            if self.is_fc_model:
+                                tool_calls = convert_to_function_call(generation)
+                            else:
+                                tool_calls = default_decode_execute_prompting(generation)
+
+                            single_turn_decoded_generations.append(tool_calls)
+                        except Exception:
+                            single_turn_decoded_generations.append([generation])
+
+                    multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+                try:
+                    raw_score_result = multi_turn_checker(
+                        multi_turn_decoded_generations,
+                        row['ground_truth'],
+                        row,
+                        test_category,
+                        dummy_model,
+                    )
+                except Exception:
+                    raw_score_result = {
+                        'valid': False,
+                        'error_type': 'multi_turn:checker_failed',
+                        'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                    }
 
-            score_result = {
-                'valid': float(raw_score_result['valid']),
-                'error_message': raw_score_result.get('error_message', ''),
-                'error_type': raw_score_result.get('error_type', ''),
-            }
-        else:
-            try:
-                if self.is_fc_model:
-                    decoded_tool_calls = []
-                    for tool_call in row['generation'][0]:
-                        name = list(tool_call.keys())[0]
-                        params = json.loads(tool_call[name])
-                        decoded_tool_calls.append({name: params})
-                else:
-                    decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
-
-                score_result = ast_checker(
-                    row['functions'],
-                    decoded_tool_calls,
-                    row['ground_truth'],
-                    row['language'],
-                    row['test_category'],
-                    dummy_model,
-                )
-            except Exception:
                 score_result = {
-                    'valid':
-                    'error_message':
-                    'error_type': '
+                    'valid': float(raw_score_result['valid']),
+                    'error_message': raw_score_result.get('error_message', ''),
+                    'error_type': raw_score_result.get('error_type', ''),
                 }
… 11 removed lines not shown …
+            else:
+                try:
+                    if self.is_fc_model:
+                        decoded_tool_calls = []
+                        for tool_call in row['generation'][0]:
+                            name = list(tool_call.keys())[0]
+                            params = tool_call[name]
+                            decoded_tool_calls.append({name: params})
+                    else:
+                        decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                    score_result = ast_checker(
+                        row['functions'],
+                        decoded_tool_calls,
+                        row['ground_truth'],
+                        row['language'],
+                        row['test_category'],
+                        dummy_model,
+                    )
+                except Exception:
+                    score_result = {
+                        'valid': False,
+                        'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                        'error_type': 'ast_decoder:decoder_failed',
+                    }
+
+            score.value = {
+                'acc': float(score_result['valid']),
+            }
+            score.explanation = score_result.get('error_message', 'Evaluation completed')
+            score.metadata = {
+                'raw_score_result': score_result,
+                'test_category': test_category,
+                'underscore_to_dot': self.underscore_to_dot,
+                'is_fc_model': self.is_fc_model
+            }
+            score.main_score_name = 'acc'
+
+        except Exception:
+            logger.error(f'Evaluation failed for sample: {task_state.sample_id}\n{traceback.format_exc()}')
+            score.value = {'acc': 0.0}
+            score.explanation = 'Evaluation failed with an unexpected error.'
+            score.metadata = {'error': traceback.format_exc()}
+            score.main_score_name = 'acc'
+        return score