evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/bfcl/bfcl_adapter.py
@@ -1,12 +1,17 @@
- import copy
  import importlib
  import json
  import re
  import traceback
- from typing import Any, List
+ from typing import Any, Dict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -32,28 +37,32 @@ SUBJECT_MAPPING = {
  }


- @Benchmark.register(
-     name='bfcl_v3',
-     pretty_name='BFCL-v3',
-     tags=['Agent', 'Function Calling'],
-     description=
-     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
-     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
-     'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
-     'Need to run `pip install bfcl-eval` before evaluating. '
-     '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',  # noqa: E501
-     dataset_id='AI-ModelScope/bfcl_v3',
-     subset_list=list(SUBJECT_MAPPING.keys()),
-     model_adapter='bfcl_server',
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='train',
-     extra_params={
-         'underscore_to_dot': True,
-         'is_fc_model': True,
-     })
- class BFCLAdapter(DataAdapter):
+ @register_benchmark(
+     BenchmarkMeta(
+         name='bfcl_v3',
+         pretty_name='BFCL-v3',
+         tags=[Tags.FUNCTION_CALLING],
+         description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
+         'and executable function call evaluation** '
+         'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
+         'functions. Unlike previous evaluations, '
+         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+         'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
+         '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',
+         dataset_id='AI-ModelScope/bfcl_v3',
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         metric_list=['acc'],
+         eval_split='train',
+         extra_params={
+             'underscore_to_dot': True,
+             'is_fc_model': True,
+         }
+     )
+ )
+ class BFCLAdapter(DefaultDataAdapter):
+     """
+     BFCL adapter using the new data processing framework.
+     """

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
@@ -61,18 +70,14 @@ class BFCLAdapter(DataAdapter):
          spec = importlib.util.find_spec('bfcl_eval')
          if spec is None:
              raise ImportError(
-                 '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+                 '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
+             )

          self.category_map = SUBJECT_MAPPING
+         self.reformat_subset = True

-         extra_params = kwargs.get('extra_params', {})
-         self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
-         self.is_fc_model = extra_params.get('is_fc_model', True)
-
-     def load(self, **kwargs):
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+         self.is_fc_model = self.extra_params.get('is_fc_model', True)

      def preprocess_row(self, row: dict):
          """
@@ -87,151 +92,167 @@ class BFCLAdapter(DataAdapter):
          row['initial_config'] = json.loads(row['initial_config'])
          row['is_fc_model'] = self.is_fc_model

-     def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
-         self.preprocess_row(input_d)
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         self.preprocess_row(record)

          # If the model is a function calling model, we need to remove the system prompt
          if self.is_fc_model:
-             turns = input_d['turns']
+             turns = record['turns']
              new_turns = []
              for turn_idx, messages in enumerate(turns):
                  current_messages = messages.copy()
                  if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
                      current_messages = current_messages[1:]
                  new_turns.append(current_messages)
-             input_d['turns'] = new_turns
-
-         return self.gen_prompt_data(prompt='', messages=input_d)
+             record['turns'] = new_turns

-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('ground_truth', )
+         return Sample(
+             input=[ChatMessageUser(content='')],
+             target='',  # Will use the record for evaluation
+             subset_key=record['subset'],
+             metadata=record  # Store the full record for evaluation
+         )

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
-         row = copy.deepcopy(raw_input_d)
-         del row['turns']  # Remove turns as they are not needed for the match function
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         from .generation import predict
+         return predict(model, sample)

-         row['generation'] = result
-         return row
-
-     def match(self, gold: dict, pred: dict) -> dict:
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
          from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
          from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
-         from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
-                                                    default_decode_execute_prompting)
+         from bfcl_eval.model_handler.utils import (
+             convert_to_function_call,
+             default_decode_ast_prompting,
+             default_decode_execute_prompting,
+         )
          from bfcl_eval.utils import is_empty_output

-         # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
-         # which decides if model was provided with functions of the type
-         # spotify.list_songs or spotify_list_songs
-         # It is False for all llama models (when using via prompting)
-         # and True for API calls
-         if self.underscore_to_dot:
-             dummy_model = 'gpt-4o-2024-11-20-FC'
-         else:
-             dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
-
-         row = pred
-         test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
-         if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
-             error = None
-             try:
-                 if self.is_fc_model:
-                     decoded_tool_calls = []
-                     for tool_call in row['generation'][0]:
-                         name = list(tool_call.keys())[0]
-                         params = json.loads(tool_call[name])
-                         decoded_tool_calls.append({name: params})
-                 else:
-                     decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
-
-                 # successful decode means valid function call was present
-                 contains_func_call = True
-                 if is_empty_output(decoded_tool_calls):
-                     # Empty output is not considered as a valid function call
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         try:
+             # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+             if self.underscore_to_dot:
+                 dummy_model = 'gpt-4o-2024-11-20-FC'
+             else:
+                 dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+             row = task_state.metadata
+             test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+
+             if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+                 error = None
+                 try:
+                     if self.is_fc_model:
+                         decoded_tool_calls = []
+                         for tool_call in row['generation'][0]:
+                             name = list(tool_call.keys())[0]
+                             params = tool_call[name]
+                             decoded_tool_calls.append({name: params})
+                     else:
+                         decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+                     # successful decode means valid function call was present
+                     contains_func_call = True
+                     if is_empty_output(decoded_tool_calls):
+                         # Empty output is not considered as a valid function call
+                         contains_func_call = False
+                         error = 'Empty decoded output.'
+                 except Exception:
                      contains_func_call = False
-                     error = 'Empty decoded output.'
-             except Exception:
-                 contains_func_call = False
-                 error = f'Failed to decode with traceback: {traceback.format_exc()}'
-             finally:
-                 valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
-                 score_result = {'valid': valid, 'error_message': error}
-
-         elif row['multi_turn']:
-             # each step might give a list of tool calls and each turn is multi-step
-             # and multi-turn has generations of all the turns
-             # hence in a multi-turn setting,
-             # multi_turn_decoded_generations is a list of list of list of strings
-             multi_turn_decoded_generations: list[list[list[str]]] = []
-             for single_turn_generations in row['generation']:
-                 single_turn_decoded_generations: list[list[str]] = []
-                 for generation in single_turn_generations:
-                     try:
-                         if self.is_fc_model:
-                             tool_calls = convert_to_function_call(generation)
-                         else:
-                             tool_calls = default_decode_execute_prompting(generation)
-
-                         single_turn_decoded_generations.append(tool_calls)
-                     except Exception:
-                         single_turn_decoded_generations.append([generation])
-
-                 multi_turn_decoded_generations.append(single_turn_decoded_generations)
-
-             try:
-                 raw_score_result = multi_turn_checker(
-                     multi_turn_decoded_generations,
-                     row['ground_truth'],
-                     row,
-                     test_category,
-                     dummy_model,
-                 )
-             except Exception:
-                 raw_score_result = {
-                     'valid': False,
-                     'error_type': 'multi_turn:checker_failed',
-                     'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
-                 }
+                     error = f'Failed to decode with traceback: {traceback.format_exc()}'
+                 finally:
+                     valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                     score_result = {'valid': valid, 'error_message': error}
+
+             elif row['multi_turn']:
+                 # each step might give a list of tool calls and each turn is multi-step
+                 # and multi-turn has generations of all the turns
+                 # hence in a multi-turn setting,
+                 # multi_turn_decoded_generations is a list of list of list of strings
+                 multi_turn_decoded_generations: list[list[list[str]]] = []
+                 for single_turn_generations in row['generation']:
+                     single_turn_decoded_generations: list[list[str]] = []
+                     for generation in single_turn_generations:
+                         try:
+                             if self.is_fc_model:
+                                 tool_calls = convert_to_function_call(generation)
+                             else:
+                                 tool_calls = default_decode_execute_prompting(generation)
+
+                             single_turn_decoded_generations.append(tool_calls)
+                         except Exception:
+                             single_turn_decoded_generations.append([generation])
+
+                     multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+                 try:
+                     raw_score_result = multi_turn_checker(
+                         multi_turn_decoded_generations,
+                         row['ground_truth'],
+                         row,
+                         test_category,
+                         dummy_model,
+                     )
+                 except Exception:
+                     raw_score_result = {
+                         'valid': False,
+                         'error_type': 'multi_turn:checker_failed',
+                         'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                     }

-             score_result = {
-                 'valid': float(raw_score_result['valid']),
-                 'error_message': raw_score_result.get('error_message', ''),
-                 'error_type': raw_score_result.get('error_type', ''),
-             }
-         else:
-             try:
-                 if self.is_fc_model:
-                     decoded_tool_calls = []
-                     for tool_call in row['generation'][0]:
-                         name = list(tool_call.keys())[0]
-                         params = json.loads(tool_call[name])
-                         decoded_tool_calls.append({name: params})
-                 else:
-                     decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
-
-                 score_result = ast_checker(
-                     row['functions'],
-                     decoded_tool_calls,
-                     row['ground_truth'],
-                     row['language'],
-                     row['test_category'],
-                     dummy_model,
-                 )
-             except Exception:
                  score_result = {
-                     'valid': False,
-                     'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
-                     'error_type': 'ast_decoder:decoder_failed',
+                     'valid': float(raw_score_result['valid']),
+                     'error_message': raw_score_result.get('error_message', ''),
+                     'error_type': raw_score_result.get('error_type', ''),
                  }
+             else:
+                 try:
+                     if self.is_fc_model:
+                         decoded_tool_calls = []
+                         for tool_call in row['generation'][0]:
+                             name = list(tool_call.keys())[0]
+                             params = tool_call[name]
+                             decoded_tool_calls.append({name: params})
+                     else:
+                         decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])

-         return {
-             'AverageAccuracy': float(score_result['valid']),
-             'raw_score_result': score_result,
-         }
+                     score_result = ast_checker(
+                         row['functions'],
+                         decoded_tool_calls,
+                         row['ground_truth'],
+                         row['language'],
+                         row['test_category'],
+                         dummy_model,
+                     )
+                 except Exception:
+                     score_result = {
+                         'valid': False,
+                         'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                         'error_type': 'ast_decoder:decoder_failed',
+                     }

-     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
-         # aggregate review results
-         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+             score.value = {
+                 'acc': float(score_result['valid']),
+             }
+             score.explanation = score_result.get('error_message', 'Evaluation completed')
+             score.metadata = {
+                 'raw_score_result': score_result,
+                 'test_category': test_category,
+                 'underscore_to_dot': self.underscore_to_dot,
+                 'is_fc_model': self.is_fc_model
+             }
+             score.main_score_name = 'acc'

-         return super().compute_metric(res_dict, **kwargs)
+         except Exception:
+             logger.error(f'Evaluation failed for sample: {task_state.sample_id}\n{traceback.format_exc()}')
+             score.value = {'acc': 0.0}
+             score.explanation = 'Evaluation failed with an unexpected error.'
+             score.metadata = {'error': traceback.format_exc()}
+             score.main_score_name = 'acc'
+         return score
evalscope/benchmarks/bfcl/generation.py
@@ -0,0 +1,222 @@
+ import json
+ import time
+ from typing import Any
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+ from evalscope.api.tool.tool_info import ToolInfo
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def predict(model: Model, sample: Sample) -> ModelOutput:
+     """Main prediction function for BFCL using the new API framework."""
+     # Extract the row data from sample metadata
+     row = sample.metadata
+     is_fc_model = row.get('is_fc_model', False)
+
+     if is_fc_model:
+         response, model_usage = generate_turn_with_tools(model, row)
+     else:
+         response, model_usage = generate_turn(model, row)
+
+     sample.metadata['generation'] = response
+     # wrap response with openai types
+     return ModelOutput(
+         model=model.name,
+         choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+         model_usage=model_usage,
+         time=time.time()
+     )
+
+
+ def generate_turn(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import (
+         DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+         MAXIMUM_STEP_LIMIT,
+     )
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             new_turn = [{
+                 'role':
+                 'user',
+                 'content':
+                 DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                     functions=row['missing_functions'][str(turn_idx)]
+                 ),
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages
+             from evalscope.api.messages.chat_message import dict_to_chat_message
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+             # Get model response using generate method
+             model_output = model.generate(chat_messages)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 result = message
+             else:
+                 result = message.content
+
+             logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+             current_responses.append(result)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools:
+                 try:
+                     tool_calls = default_decode_execute_prompting(result)
+                 except Exception:
+                     tool_calls = None
+
+                 if tool_calls is None:
+                     break
+
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_calls,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+                 # Append tool outputs to the current messages
+                 tool_results = []
+                 for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                     tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                 current_messages.append({
+                     'role': 'user',
+                     'content': repr(tool_results),
+                 })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
+
+
+ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import convert_to_function_call
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+         tools = row['tools']
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             # inject new functions on the fly
+             new_tools = row['missing_functions'][str(turn_idx)]
+             for new_tool in new_tools:
+                 cur_tool = new_tool[0]
+                 # change type to object
+                 if cur_tool['parameters']['type'] != 'object':
+                     cur_tool['parameters']['type'] = 'object'
+                 tools.append({
+                     'type': 'function',
+                     'function': cur_tool,
+                 })
+             new_turn = [{
+                 'role': 'user',
+                 'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages with tools
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+             current_sample = Sample(
+                 input=chat_messages,
+                 target='',
+                 tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+             )
+
+             # Get model response
+             model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 model_responses = [message]
+                 tool_call_strs = None
+             elif message.tool_calls:
+                 model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                 try:
+                     tool_call_strs = convert_to_function_call(model_responses)
+                 except Exception as e:
+                     logger.error(f'Error converting tool calls to function call strings: {e}')
+                     tool_call_strs = None
+             else:
+                 model_responses = [message.content]
+                 tool_call_strs = None
+
+             current_responses.extend(model_responses)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools and tool_call_strs is not None:
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_call_strs,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+
+                 for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                     current_messages.append({
+                         'role': 'tool',
+                         'tool_call_id': tc.id,
+                         'content': json.dumps({'response': tool_output}),
+                     })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
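For orientation, the two hunks above exercise the new adapter API introduced in 1.0.0 (register_benchmark, BenchmarkMeta, DefaultDataAdapter, Sample, Score). A minimal custom-adapter sketch against that surface might look like the following; the benchmark name, dataset id, split, and exact-match scoring are hypothetical placeholders, not anything shipped in the package:

# Hypothetical sketch against the 1.0.0 adapter API shown in the diff above.
# 'demo_qa' and 'my-org/demo_qa' are placeholder names, not real benchmarks.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='demo_qa',               # placeholder benchmark name
        dataset_id='my-org/demo_qa',  # placeholder dataset id
        metric_list=['acc'],
        eval_split='test',            # placeholder split
    )
)
class DemoQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample, keeping the record for scoring.
        return Sample(
            input=[ChatMessageUser(content=record['question'])],
            target=record['answer'],
            metadata=record,
        )

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        # Simple exact-match scoring, mirroring the Score usage in the BFCL adapter above.
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        score.value = {'acc': float(filtered_prediction.strip() == reference.strip())}
        score.main_score_name = 'acc'
        return score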