evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py}

@@ -1,10 +1,9 @@
-import importlib
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
@@ -12,6 +11,8 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -36,19 +37,25 @@ SUBJECT_MAPPING = {
     'multi_turn_long_context': 'MULTI_TURN'
 }
 
+BFCL_V3_TO_V4_SUBJECT_MAPPING = {
+    'simple': 'simple_python',
+    'java': 'simple_java',
+    'javascript': 'simple_javascript',
+}
+
 
 @register_benchmark(
     BenchmarkMeta(
         name='bfcl_v3',
         pretty_name='BFCL-v3',
-        tags=[Tags.FUNCTION_CALLING],
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
         description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
         'and executable function call evaluation** '
         'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
         'functions. Unlike previous evaluations, '
         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
-        'Need to run `pip install bfcl-eval==2025.
-        '[Usage Example](https://evalscope.readthedocs.io/
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
         dataset_id='AI-ModelScope/bfcl_v3',
         subset_list=list(SUBJECT_MAPPING.keys()),
         metric_list=['acc'],
@@ -59,7 +66,7 @@ SUBJECT_MAPPING = {
         }
     )
 )
-class
+class BFCLV3Adapter(AgentAdapter):
     """
     BFCL adapter using the new data processing framework.
     """
@@ -67,14 +74,12 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
 
         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
 
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
@@ -108,8 +113,8 @@ class BFCLAdapter(DefaultDataAdapter):
         record['turns'] = new_turns
 
         return Sample(
-            input=[ChatMessageUser(content='')],
-            target='', # Will use the record for evaluation
+            input=[ChatMessageUser(content=json.dumps(record['turns']))],
+            target=json.dumps(record['ground_truth']), # Will use the record for evaluation
             subset_key=record['subset'],
             metadata=record # Store the full record for evaluation
         )
@@ -130,6 +135,8 @@ class BFCLAdapter(DefaultDataAdapter):
         )
         from bfcl_eval.utils import is_empty_output
 
+        from .utils import convert_format_language, convert_language
+
         score = Score(
             extracted_prediction=filtered_prediction,
             prediction=original_prediction,
@@ -143,7 +150,7 @@ class BFCLAdapter(DefaultDataAdapter):
         dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
 
         row = task_state.metadata
-        test_category =
+        test_category = BFCL_V3_TO_V4_SUBJECT_MAPPING.get(row['test_category'], row['test_category'])
 
         if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
             error = None
@@ -155,7 +162,9 @@ class BFCLAdapter(DefaultDataAdapter):
                 params = tool_call[name]
                 decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )
 
             # successful decode means valid function call was present
             contains_func_call = True
@@ -220,14 +229,16 @@ class BFCLAdapter(DefaultDataAdapter):
                 params = tool_call[name]
                 decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )
 
             score_result = ast_checker(
                 row['functions'],
                 decoded_tool_calls,
                 row['ground_truth'],
-                row['language'],
-
+                convert_language(row['language']),
+                test_category,
                 dummy_model,
             )
         except Exception:
@@ -256,3 +267,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
         score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """ # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
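The new `_on_generate_report_end` hook rolls the BFCL-v3 subsets up into `non_live`, `live`, `multi_turn`, and `overall` using two aggregation modes. The sketch below only illustrates the difference between them; `SubsetScore` and both helper functions are simplified stand-ins for `evalscope.report`, not the library implementation.

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class SubsetScore:
    # Simplified stand-in for evalscope.report.Subset: a mean score plus a sample count.
    name: str
    score: float
    num: int


def unweighted_avg(names: List[str], subsets: Dict[str, SubsetScore]) -> float:
    # Plain mean over subset scores; every subset counts once (steps 1, 2.1, 3.1, 3.3, 4).
    found = [subsets[n] for n in names if n in subsets]
    return sum(s.score for s in found) / len(found) if found else 0.0


def weighted_avg(names: List[str], subsets: Dict[str, SubsetScore]) -> float:
    # Mean weighted by sample count, so larger subsets dominate (steps 2.2, 2.4, 2.6, 3.2).
    found = [subsets[n] for n in names if n in subsets]
    total = sum(s.num for s in found)
    return sum(s.score * s.num for s in found) / total if total else 0.0


# Step 1 of the docstring: simple, java, javascript -> simple_ast (illustrative numbers).
subsets = {
    'simple': SubsetScore('simple', 0.90, 400),
    'java': SubsetScore('java', 0.70, 100),
    'javascript': SubsetScore('javascript', 0.80, 50),
}
print(unweighted_avg(['simple', 'java', 'javascript'], subsets))  # ~0.8
print(weighted_avg(['simple', 'java', 'javascript'], subsets))    # ~0.8545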
evalscope/benchmarks/bfcl/{generation.py → v3/generation.py}

@@ -72,13 +72,14 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
             result = message
         else:
-            result = message.
+            result = message.text
 
         logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
         current_responses.append(result)
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break
 
     all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             new_tools = row['missing_functions'][str(turn_idx)]
             for new_tool in new_tools:
                 cur_tool = new_tool[0]
-
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                 tools.append({
                     'type': 'function',
                     'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
                 logger.error(f'Error converting tool calls to function call strings: {e}')
                 tool_call_strs = None
         else:
-            model_responses = [message.
+            model_responses = [message.text]
             tool_call_strs = None
 
         current_responses.extend(model_responses)
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
            break
 
     all_model_responses.append(current_responses)
evalscope/benchmarks/bfcl/v3/utils.py

@@ -0,0 +1,23 @@
+def convert_language(language: str) -> str:
+    """Convert language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import Language
+    mapping = {
+        'python': Language.PYTHON,
+        'java': Language.JAVA,
+        'javascript': Language.JAVASCRIPT,
+    }
+    return mapping[language.lower()]
+
+
+def convert_format_language(format_language: str) -> str:
+    """Convert format language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import ReturnFormat
+    mapping = {
+        'python': ReturnFormat.PYTHON,
+        'java': ReturnFormat.JAVA,
+        'javascript': ReturnFormat.JAVASCRIPT,
+        'json': ReturnFormat.JSON,
+        'verbose_xml': ReturnFormat.VERBOSE_XML,
+        'concise_xml': ReturnFormat.CONCISE_XML,
+    }
+    return mapping[format_language.lower()]
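A hedged usage sketch of these helpers, as the v3 adapter hunks above apply them; it assumes `bfcl-eval==2025.10.27.1` is installed and that the module is importable at the new path shown in the file list.

# Assumes: pip install bfcl-eval==2025.10.27.1 and evalscope >= 1.2.0 on the path.
from evalscope.benchmarks.bfcl.v3.utils import convert_format_language, convert_language

lang = convert_language('java')              # bfcl_eval.constants.enums.Language.JAVA
fmt = convert_format_language('javascript')  # bfcl_eval.constants.enums.ReturnFormat.JAVASCRIPT

# In the adapter, `fmt` goes to default_decode_ast_prompting(...) and `lang` to ast_checker(...),
# bridging the v3 string categories to the enum-based bfcl_eval v4 API.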
evalscope/benchmarks/bfcl/v4/__init__.py

File without changes
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py

@@ -0,0 +1,229 @@
+import json
+import os
+import traceback
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.report import Report
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from .utils import (
+    ALL_SCORING_CATEGORIES,
+    compute_aggregate_subsets,
+    compute_entry_result,
+    load_bfcl_data,
+    process_test_entries,
+    run_prereq_inference,
+)
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='bfcl_v4',
+        pretty_name='BFCL-v4',
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
+        description='With function-calling being the building blocks of Agents, '
+        'the Berkeley Function-Calling Leaderboard (BFCL) V4 presents a holistic agentic '
+        'evaluation for LLMs. BFCL V4 Agentic includes web search, memory, and format sensitivity. '
+        'Together, the ability to web search, read and write from memory, and the ability to invoke '
+        'functions in different languages present the building blocks for the exciting and extremely '
+        'challenging avenues that power agentic LLMs today from deep-research, to agents for coding and law. '
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v4.html)',
+        dataset_id='https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard',
+        subset_list=ALL_SCORING_CATEGORIES,
+        metric_list=['acc'],
+        eval_split='train',
+        extra_params={
+            'underscore_to_dot': True,
+            'is_fc_model': True,
+            'SERPAPI_API_KEY': None,
+        }
+    )
+)
+class BFCLV4Adapter(AgentAdapter):
+    """
+    BFCL adapter using the new data processing framework.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
+
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
+
+        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+        self.is_fc_model = self.extra_params.get('is_fc_model', True)
+        # Set SERPAPI_API_KEY in environment variables if provided
+        serpapi_api_key = self.extra_params.get('SERPAPI_API_KEY', None)
+        if serpapi_api_key:
+            os.environ['SERPAPI_API_KEY'] = serpapi_api_key
+        self.model_result_dir = Path(self._task_config.work_dir) if self._task_config else Path('./bfcl_model_results')
+        self.handler = None
+        self.prereq_entries = []
+        self.prereq_finished = False
+
+    def load(self):
+        """Load and process the BFCL dataset."""
+        from bfcl_eval.utils import parse_test_category_argument
+        datasets = {}
+        all_test_categories = parse_test_category_argument(self.subset_list)
+
+        test_entries_by_cat, ground_truth_by_cat = load_bfcl_data(all_test_categories)
+
+        for category in all_test_categories:
+            test_entries = test_entries_by_cat.get(category, [])
+            ground_truth_entries = ground_truth_by_cat.get(category, [])
+
+            if not test_entries:
+                continue
+
+            datasets[category] = self._create_dataset_for_category(category, test_entries, ground_truth_entries)
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def _create_dataset_for_category(
+        self, category: str, test_entries: List[Dict], ground_truth_entries: List[Dict]
+    ) -> DatasetDict:
+        """Create a dataset for a single category by merging test and ground truth data."""
+        processed_entries, prereq_entries = process_test_entries(
+            category=category,
+            test_entries=test_entries,
+            ground_truth_entries=ground_truth_entries,
+            model_result_dir=self.model_result_dir,
+        )
+        # collect prereq entries for later prereq inference
+        self.prereq_entries.extend(prereq_entries)
+
+        return DictDataLoader(
+            dict_list=processed_entries,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
+        ).load()
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=json.dumps(record['question']))],
+            target=json.dumps(record['ground_truth']), # Will use the record for evaluation
+            metadata=record # Store the full record for evaluation
+        )
+
+    @thread_safe
+    def _init_handler(self):
+        if self.handler is not None:
+            return # Handler already initialized
+
+        from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler
+
+        # Set env variables for OpenAI API
+        os.environ['OPENAI_API_KEY'] = self._task_config.api_key
+        os.environ['OPENAI_BASE_URL'] = self._task_config.api_url
+
+        self.handler = OpenAICompletionsHandler(
+            model_name=self._task_config.model,
+            temperature=self._task_config.generation_config.temperature,
+            registry_name=self._task_config.model_id,
+            is_fc_model=self.is_fc_model,
+        )
+
+        self._prereq_inference()
+
+    def _prereq_inference(self):
+        if self.prereq_finished:
+            return
+        # MOVED: delegate prereq processing to utils
+        run_prereq_inference(
+            handler=self.handler,
+            prereq_entries=self.prereq_entries,
+            model_result_dir=self.model_result_dir,
+            batch_size=self._task_config.eval_batch_size,
+            logger=logger,
+        )
+        self.prereq_finished = True
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        try:
+            self._init_handler()
+
+            result, _ = self.handler.inference(
+                deepcopy(sample.metadata), include_input_log=False, exclude_state_log=False
+            )
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps(result),
+            )
+        except Exception as e:
+            # This is usually the case when the model getting stuck on one particular test case.
+            # For example, timeout error or FC model returning invalid JSON response.
+            # Since temperature is already set to 0.001, retrying the same test case will not help.
+            # So we continue the generation process and record the error message as the model response
+            logger.error(f'Error during inference for sample ID {sample.metadata.get("id")}: {e}')
+            logger.error(traceback.format_exc())
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps({
+                    'error': str(e),
+                    'error_message': traceback.format_exc(),
+                }),
+            )
+        return output
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        self._init_handler()
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        model_result = json.loads(filtered_prediction)
+        prompt = task_state.metadata
+
+        entry_result = compute_entry_result(
+            handler=self.handler,
+            model_result=model_result,
+            prompt_entry=prompt,
+            underscore_to_dot=self.underscore_to_dot,
+        )
+
+        valid = 1 if entry_result['valid'] else 0
+        score.value = {'acc': valid}
+        score.metadata = {
+            'valid': bool(entry_result.get('valid')),
+            'error': str(entry_result.get('error')),
+            'error_message': str(entry_result.get('error_message')),
+            'error_type': str(entry_result.get('error_type')),
+        }
+        return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+        """
+
+        # noqa: E501
+        # MOVED: delegate aggregation logic to utils
+        compute_aggregate_subsets(report)