evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py}

@@ -1,10 +1,9 @@
-import importlib
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List
 
-from evalscope.api.benchmark import
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
@@ -12,6 +11,8 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -36,19 +37,25 @@ SUBJECT_MAPPING = {
     'multi_turn_long_context': 'MULTI_TURN'
 }
 
+BFCL_V3_TO_V4_SUBJECT_MAPPING = {
+    'simple': 'simple_python',
+    'java': 'simple_java',
+    'javascript': 'simple_javascript',
+}
+
 
 @register_benchmark(
     BenchmarkMeta(
         name='bfcl_v3',
         pretty_name='BFCL-v3',
-        tags=[Tags.FUNCTION_CALLING],
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
         description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
         'and executable function call evaluation** '
         'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
         'functions. Unlike previous evaluations, '
         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
-        'Need to run `pip install bfcl-eval==2025.
-        '[Usage Example](https://evalscope.readthedocs.io/
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
         dataset_id='AI-ModelScope/bfcl_v3',
         subset_list=list(SUBJECT_MAPPING.keys()),
         metric_list=['acc'],
@@ -59,7 +66,7 @@ SUBJECT_MAPPING = {
         }
     )
 )
-class
+class BFCLV3Adapter(AgentAdapter):
     """
     BFCL adapter using the new data processing framework.
     """
@@ -67,14 +74,12 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
 
         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
 
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
@@ -108,8 +113,8 @@ class BFCLAdapter(DefaultDataAdapter):
         record['turns'] = new_turns
 
         return Sample(
-            input=[ChatMessageUser(content='')],
-            target='', # Will use the record for evaluation
+            input=[ChatMessageUser(content=json.dumps(record['turns']))],
+            target=json.dumps(record['ground_truth']), # Will use the record for evaluation
             subset_key=record['subset'],
             metadata=record # Store the full record for evaluation
         )
@@ -130,6 +135,8 @@ class BFCLAdapter(DefaultDataAdapter):
         )
         from bfcl_eval.utils import is_empty_output
 
+        from .utils import convert_format_language, convert_language
+
         score = Score(
             extracted_prediction=filtered_prediction,
             prediction=original_prediction,
@@ -143,7 +150,7 @@ class BFCLAdapter(DefaultDataAdapter):
         dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
 
         row = task_state.metadata
-        test_category =
+        test_category = BFCL_V3_TO_V4_SUBJECT_MAPPING.get(row['test_category'], row['test_category'])
 
         if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
             error = None
@@ -155,7 +162,9 @@ class BFCLAdapter(DefaultDataAdapter):
                 params = tool_call[name]
                 decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )
 
             # successful decode means valid function call was present
             contains_func_call = True
@@ -220,14 +229,16 @@ class BFCLAdapter(DefaultDataAdapter):
                 params = tool_call[name]
                 decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )
 
             score_result = ast_checker(
                 row['functions'],
                 decoded_tool_calls,
                 row['ground_truth'],
-                row['language'],
-
+                convert_language(row['language']),
+                test_category,
                 dummy_model,
             )
         except Exception:
@@ -256,3 +267,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
         score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """ # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
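The new `_on_generate_report_end` hook rolls the BFCL-v3 subsets up into `non_live`, `live`, `multi_turn`, and `overall` using two aggregation modes. The sketch below only illustrates the difference between them; `SubsetScore` and both helper functions are simplified stand-ins for `evalscope.report`, not the library implementation.

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class SubsetScore:
    # Simplified stand-in for evalscope.report.Subset: a mean score plus a sample count.
    name: str
    score: float
    num: int


def unweighted_avg(names: List[str], subsets: Dict[str, SubsetScore]) -> float:
    # Plain mean over subset scores; every subset counts once (steps 1, 2.1, 3.1, 3.3, 4).
    found = [subsets[n] for n in names if n in subsets]
    return sum(s.score for s in found) / len(found) if found else 0.0


def weighted_avg(names: List[str], subsets: Dict[str, SubsetScore]) -> float:
    # Mean weighted by sample count, so larger subsets dominate (steps 2.2, 2.4, 2.6, 3.2).
    found = [subsets[n] for n in names if n in subsets]
    total = sum(s.num for s in found)
    return sum(s.score * s.num for s in found) / total if total else 0.0


# Step 1 of the docstring: simple, java, javascript -> simple_ast (illustrative numbers).
subsets = {
    'simple': SubsetScore('simple', 0.90, 400),
    'java': SubsetScore('java', 0.70, 100),
    'javascript': SubsetScore('javascript', 0.80, 50),
}
print(unweighted_avg(['simple', 'java', 'javascript'], subsets))  # ~0.8
print(weighted_avg(['simple', 'java', 'javascript'], subsets))    # ~0.8545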
evalscope/benchmarks/bfcl/{generation.py → v3/generation.py}

@@ -72,13 +72,14 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
             result = message
         else:
-            result = message.
+            result = message.text
 
         logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
         current_responses.append(result)
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
             break
 
     all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             new_tools = row['missing_functions'][str(turn_idx)]
             for new_tool in new_tools:
                 cur_tool = new_tool[0]
-
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                 tools.append({
                     'type': 'function',
                     'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         # Handle the response based on the model output structure
         message = model_output.message
-
+        if model_output.usage is not None:
+            model_usage += model_output.usage
 
         current_messages.append(message)
         if isinstance(message, str):
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
                 logger.error(f'Error converting tool calls to function call strings: {e}')
                 tool_call_strs = None
         else:
-            model_responses = [message.
+            model_responses = [message.text]
             tool_call_strs = None
 
         current_responses.extend(model_responses)
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
 
         n_steps += 1
         if n_steps > MAXIMUM_STEP_LIMIT:
-            logger.
+            logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
            break
 
     all_model_responses.append(current_responses)
evalscope/benchmarks/bfcl/v3/utils.py

@@ -0,0 +1,23 @@
+def convert_language(language: str) -> str:
+    """Convert language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import Language
+    mapping = {
+        'python': Language.PYTHON,
+        'java': Language.JAVA,
+        'javascript': Language.JAVASCRIPT,
+    }
+    return mapping[language.lower()]
+
+
+def convert_format_language(format_language: str) -> str:
+    """Convert format language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import ReturnFormat
+    mapping = {
+        'python': ReturnFormat.PYTHON,
+        'java': ReturnFormat.JAVA,
+        'javascript': ReturnFormat.JAVASCRIPT,
+        'json': ReturnFormat.JSON,
+        'verbose_xml': ReturnFormat.VERBOSE_XML,
+        'concise_xml': ReturnFormat.CONCISE_XML,
+    }
+    return mapping[format_language.lower()]
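A hedged usage sketch of these helpers, as the v3 adapter hunks above apply them; it assumes `bfcl-eval==2025.10.27.1` is installed and that the module is importable at the new path shown in the file list.

# Assumes: pip install bfcl-eval==2025.10.27.1 and evalscope >= 1.2.0 on the path.
from evalscope.benchmarks.bfcl.v3.utils import convert_format_language, convert_language

lang = convert_language('java')              # bfcl_eval.constants.enums.Language.JAVA
fmt = convert_format_language('javascript')  # bfcl_eval.constants.enums.ReturnFormat.JAVASCRIPT

# In the adapter, `fmt` goes to default_decode_ast_prompting(...) and `lang` to ast_checker(...),
# bridging the v3 string categories to the enum-based bfcl_eval v4 API.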
evalscope/benchmarks/bfcl/v4/__init__.py

File without changes
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py

@@ -0,0 +1,229 @@
+import json
+import os
+import traceback
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.report import Report
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from .utils import (
+    ALL_SCORING_CATEGORIES,
+    compute_aggregate_subsets,
+    compute_entry_result,
+    load_bfcl_data,
+    process_test_entries,
+    run_prereq_inference,
+)
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='bfcl_v4',
+        pretty_name='BFCL-v4',
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
+        description='With function-calling being the building blocks of Agents, '
+        'the Berkeley Function-Calling Leaderboard (BFCL) V4 presents a holistic agentic '
+        'evaluation for LLMs. BFCL V4 Agentic includes web search, memory, and format sensitivity. '
+        'Together, the ability to web search, read and write from memory, and the ability to invoke '
+        'functions in different languages present the building blocks for the exciting and extremely '
+        'challenging avenues that power agentic LLMs today from deep-research, to agents for coding and law. '
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v4.html)',
+        dataset_id='https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard',
+        subset_list=ALL_SCORING_CATEGORIES,
+        metric_list=['acc'],
+        eval_split='train',
+        extra_params={
+            'underscore_to_dot': True,
+            'is_fc_model': True,
+            'SERPAPI_API_KEY': None,
+        }
+    )
+)
+class BFCLV4Adapter(AgentAdapter):
+    """
+    BFCL adapter using the new data processing framework.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
+
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
+
+        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+        self.is_fc_model = self.extra_params.get('is_fc_model', True)
+        # Set SERPAPI_API_KEY in environment variables if provided
+        serpapi_api_key = self.extra_params.get('SERPAPI_API_KEY', None)
+        if serpapi_api_key:
+            os.environ['SERPAPI_API_KEY'] = serpapi_api_key
+        self.model_result_dir = Path(self._task_config.work_dir) if self._task_config else Path('./bfcl_model_results')
+        self.handler = None
+        self.prereq_entries = []
+        self.prereq_finished = False
+
+    def load(self):
+        """Load and process the BFCL dataset."""
+        from bfcl_eval.utils import parse_test_category_argument
+        datasets = {}
+        all_test_categories = parse_test_category_argument(self.subset_list)
+
+        test_entries_by_cat, ground_truth_by_cat = load_bfcl_data(all_test_categories)
+
+        for category in all_test_categories:
+            test_entries = test_entries_by_cat.get(category, [])
+            ground_truth_entries = ground_truth_by_cat.get(category, [])
+
+            if not test_entries:
+                continue
+
+            datasets[category] = self._create_dataset_for_category(category, test_entries, ground_truth_entries)
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def _create_dataset_for_category(
+        self, category: str, test_entries: List[Dict], ground_truth_entries: List[Dict]
+    ) -> DatasetDict:
+        """Create a dataset for a single category by merging test and ground truth data."""
+        processed_entries, prereq_entries = process_test_entries(
+            category=category,
+            test_entries=test_entries,
+            ground_truth_entries=ground_truth_entries,
+            model_result_dir=self.model_result_dir,
+        )
+        # collect prereq entries for later prereq inference
+        self.prereq_entries.extend(prereq_entries)
+
+        return DictDataLoader(
+            dict_list=processed_entries,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
+        ).load()
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=json.dumps(record['question']))],
+            target=json.dumps(record['ground_truth']), # Will use the record for evaluation
+            metadata=record # Store the full record for evaluation
+        )
+
+    @thread_safe
+    def _init_handler(self):
+        if self.handler is not None:
+            return # Handler already initialized
+
+        from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler
+
+        # Set env variables for OpenAI API
+        os.environ['OPENAI_API_KEY'] = self._task_config.api_key
+        os.environ['OPENAI_BASE_URL'] = self._task_config.api_url
+
+        self.handler = OpenAICompletionsHandler(
+            model_name=self._task_config.model,
+            temperature=self._task_config.generation_config.temperature,
+            registry_name=self._task_config.model_id,
+            is_fc_model=self.is_fc_model,
+        )
+
+        self._prereq_inference()
+
+    def _prereq_inference(self):
+        if self.prereq_finished:
+            return
+        # MOVED: delegate prereq processing to utils
+        run_prereq_inference(
+            handler=self.handler,
+            prereq_entries=self.prereq_entries,
+            model_result_dir=self.model_result_dir,
+            batch_size=self._task_config.eval_batch_size,
+            logger=logger,
+        )
+        self.prereq_finished = True
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        try:
+            self._init_handler()
+
+            result, _ = self.handler.inference(
+                deepcopy(sample.metadata), include_input_log=False, exclude_state_log=False
+            )
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps(result),
+            )
+        except Exception as e:
+            # This is usually the case when the model getting stuck on one particular test case.
+            # For example, timeout error or FC model returning invalid JSON response.
+            # Since temperature is already set to 0.001, retrying the same test case will not help.
+            # So we continue the generation process and record the error message as the model response
+            logger.error(f'Error during inference for sample ID {sample.metadata.get("id")}: {e}')
+            logger.error(traceback.format_exc())
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps({
+                    'error': str(e),
+                    'error_message': traceback.format_exc(),
+                }),
+            )
+        return output
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        self._init_handler()
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        model_result = json.loads(filtered_prediction)
+        prompt = task_state.metadata
+
+        entry_result = compute_entry_result(
+            handler=self.handler,
+            model_result=model_result,
+            prompt_entry=prompt,
+            underscore_to_dot=self.underscore_to_dot,
+        )
+
+        valid = 1 if entry_result['valid'] else 0
+        score.value = {'acc': valid}
+        score.metadata = {
+            'valid': bool(entry_result.get('valid')),
+            'error': str(entry_result.get('error')),
+            'error_message': str(entry_result.get('error_message')),
+            'error_type': str(entry_result.get('error_type')),
+        }
+        return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+        """
+
+        # noqa: E501
+        # MOVED: delegate aggregation logic to utils
+        compute_aggregate_subsets(report)