evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/bbh/bbh_adapter.py

@@ -1,17 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import json
  import os
- import random
  import re
+ from typing import Any, Dict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys
- from evalscope.metrics import exact_match
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

  # BBH multiple choice subset list
@@ -55,160 +54,89 @@ FREE_FORM_LIST = [
  TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST

-
- @Benchmark.register(
- name='bbh',
- pretty_name='BBH',
- tags=['Reasoning'],
- description=
- 'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.', # noqa: E501
- dataset_id='modelscope/bbh',
- subset_list=SUBSET_LIST,
- metric_list=['AverageAccuracy'],
- few_shot_num=3,
- train_split=None,
- eval_split='test',
- prompt_template="Q: {query}\nA: Let's think step by step.",
+ PROMPT_TEMPLATE = """
+ Q: {question}
+ A: Let's think step by step. Put your final answer in the format of "So the answer is $ANSWER" (without quotes and markdown) where $ANSWER is the answer to the problem.
+ """.lstrip() # noqa: E501
+
+ FEWSHOT_TEMPLATE = """
+ {fewshot}
+
+ """.lstrip() + PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='bbh',
+ pretty_name='BBH',
+ dataset_id='evalscope/bbh',
+ tags=[Tags.REASONING],
+ description=
+ 'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.', # noqa: E501
+ subset_list=SUBSET_LIST,
+ few_shot_num=3,
+ train_split=None,
+ eval_split='test',
+ metric_list=['acc'],
+ prompt_template=PROMPT_TEMPLATE,
+ few_shot_prompt_template=FEWSHOT_TEMPLATE,
+ )
  )
- class BBHAdapter(DataAdapter):
+ class BBHAdapter(DefaultDataAdapter):
  """
  Adapter for BBH free-form and multiple-choices sub-tasks.
  """

  def __init__(self, **kwargs):
-
  few_shot_num = kwargs.get('few_shot_num', 3)

  if few_shot_num != 3 and few_shot_num != 0:
- logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
- f'Use 3-shot by default.')
+ logger.error(
+ f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
+ f'Use 3-shot by default.'
+ )
  kwargs['few_shot_num'] = 3

  super().__init__(**kwargs)

- def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict = {}
- for subset_name in subset_list:
- for split_name in [self.eval_split]:
- if os.path.exists(dataset_name_or_path):
- file_path = os.path.join(dataset_name_or_path, f'{subset_name}.json')
- else:
- file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
- if os.path.exists(file_path):
- with open(file_path, 'r', encoding='utf-8') as f:
- examples = json.load(f)['examples']
- if subset_name in data_dict:
- data_dict[subset_name].update({split_name: examples})
- else:
- data_dict[subset_name] = {split_name: examples}
-
- return data_dict
-
- def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
- """
- Generate model prompt from raw data, unify the prompt format for bbh(multiple choice) benchmark.
-
- Args:
- input_d (dict): The raw input. A single data format of the BBH:
-
- {
- 'input': '((-1 + 2 + 9 * 5) - (-2 + -4 + -4 * -7)) =',
- 'target': '24',
- }
-
- Returns:
- {'data': ['xxx']}
- """
- # few_shot_list: should be ['xxxx']
- if len(few_shot_list) > 0:
- cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
- else:
- cot_prompts = ''
- full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
-
- return self.gen_prompt_data(full_prompt)
-
- def gen_prompts(self, data_dict: dict) -> dict:
- """
- Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
- Args:
- data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
- Returns:
- {'subset_name': [prompt_d_1, prompt_d_2, ...]}
- prompt_d_i (dict): refer to the output of gen_prompt method.
-
- e.g. train -- few-shot data, test -- target dataset to evaluate.
- """
- res_dict: dict = {}
-
- if self.few_shot_num < 0:
- raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
- logger.info(f'Use default settings: '
- f'> few_shot_num: {self.few_shot_num}, '
- f'> few_shot_split: {self.train_split}, '
- f'> target_eval_split: {self.eval_split}')
-
- for sub_name, sub_data_dict in data_dict.items():
- few_shot_data = []
- if self.few_shot_num > 0:
- with open(
- os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
- encoding='utf-8') as f:
- cot_prompt_str = f.read()
- few_shot_data = [cot_prompt_str]
-
- res_dict[sub_name] = []
- for sample_d in sub_data_dict[self.eval_split]:
- prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=few_shot_data)
- sample_d_new = sample_d.copy()
- if sub_name in MULTIPLE_CHOICE_LIST:
- sample_d_new[TASK_TYPE] = MULTIPLE_CHOICE
- elif sub_name in FREE_FORM_LIST:
- sample_d_new[TASK_TYPE] = FREE_FORM
- else:
- raise ValueError(f'Invalid subset name: {sub_name}')
-
- prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
- res_dict[sub_name].append(prompt_d)
-
- return res_dict
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Get the gold choice
- gold = input_d.get('target', '')
- # remove brackets
- if gold is None:
- logger.error(f'BBHAdapter: gold is None.')
- gold = gold.replace('(', '').replace(')', '')
- return gold
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- """
- Parse the model output to get the answer. Could be the best choice index.
-
- Args:
- result: Predicted answer from the model. Usually a string for chat.
- raw_input_d (dict): The raw input. Depending on the dataset.
- eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
- Returns:
- The parsed answer. Depending on the dataset. Usually a string for chat.
- """
- # Note: to use same extraction method for both of checkpoint/service/custom.
- task_type: str = raw_input_d.get(TASK_TYPE)
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ input = record['input']
+ target = record['target'].replace('(', '').replace(')', '').strip() # Clean up the target answer
+
+ # Determine task type based on subset name
+ task_type = None
+ subset_name = self.current_subset_name
+ if subset_name in MULTIPLE_CHOICE_LIST:
+ task_type = MULTIPLE_CHOICE
+ elif subset_name in FREE_FORM_LIST:
+ task_type = FREE_FORM
+
+ metadata = {TASK_TYPE: task_type}
+
+ return Sample(input=input, target=target, metadata=metadata, subset_key=subset_name)
+
+ def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+ # Load CoT prompts from file for BBH
+ subset_name = sample.subset_key
+ if subset_name:
+ cot_file_path = os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{subset_name}.txt')
+ if os.path.exists(cot_file_path):
+ with open(cot_file_path, 'r', encoding='utf-8') as f:
+ fewshot = f.read().strip()
+ return self.few_shot_prompt_template.format(
+ fewshot=fewshot,
+ question=sample.input,
+ )
+
+ def extract_answer(self, prediction: str, task_state: TaskState):
+ task_type = task_state.metadata.get(TASK_TYPE)

  if task_type == MULTIPLE_CHOICE:
- return self._extract_mc_answer(result)
+ return self._extract_mc_answer(prediction)
  elif task_type == FREE_FORM:
- return self._extract_ff_answer(result)
+ return self._extract_ff_answer(prediction)
  else:
- raise ValueError(f'Invalid task type: {task_type}')
-
- def match(self, gold: str, pred: str) -> float:
- return exact_match(gold=gold, pred=pred)
+ return prediction.strip()

  @classmethod
  def _extract_mc_answer(cls, ans: str) -> str:
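
The bbh diff above illustrates the new 1.0 adapter surface: a register_benchmark decorator wrapping a BenchmarkMeta, plus a DefaultDataAdapter subclass that overrides record_to_sample and extract_answer. The following is a minimal sketch of that pattern for a hypothetical QA benchmark; the benchmark name, dataset id, record fields, and answer regex are invented for illustration, and the BenchmarkMeta/Sample/TaskState signatures are assumed from the diff rather than taken from the released API reference.

# Minimal sketch of the 1.0 adapter pattern shown in the bbh diff above.
# The benchmark itself is hypothetical; BenchmarkMeta/Sample/TaskState fields
# are assumed from the diff, not verified against the released package.
import re
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags

PROMPT_TEMPLATE = 'Q: {question}\nA: End your reply with "So the answer is $ANSWER".'


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',                  # hypothetical benchmark name
        pretty_name='MyQA',
        dataset_id='my-org/my_qa',     # hypothetical dataset id
        tags=[Tags.REASONING],
        description='Toy QA benchmark illustrating the 1.0 adapter hooks.',
        metric_list=['acc'],
        eval_split='test',
        prompt_template=PROMPT_TEMPLATE,
    )
)
class MyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset row onto the framework's Sample object.
        return Sample(input=record['question'], target=record['answer'].strip())

    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
        # Pull the final answer out of a chain-of-thought style completion.
        match = re.search(r'So the answer is (.+?)\.?\s*$', prediction.strip())
        return match.group(1).strip() if match else prediction.strip()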
evalscope/benchmarks/bfcl/bfcl_adapter.py

@@ -1,12 +1,17 @@
- import copy
- import importlib
  import json
  import re
  import traceback
- from typing import Any, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -32,47 +37,43 @@ SUBJECT_MAPPING = {
  }


- @Benchmark.register(
- name='bfcl_v3',
- pretty_name='BFCL-v3',
- tags=['Agent', 'Function Calling'],
- description=
- 'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
- 'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
- 'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
- 'Need to run `pip install bfcl-eval` before evaluating. '
- '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)', # noqa: E501
- dataset_id='AI-ModelScope/bfcl_v3',
- subset_list=list(SUBJECT_MAPPING.keys()),
- model_adapter='bfcl_server',
- metric_list=['AverageAccuracy'],
- few_shot_num=0,
- train_split=None,
- eval_split='train',
- extra_params={
- 'underscore_to_dot': True,
- 'is_fc_model': True,
- })
- class BFCLAdapter(DataAdapter):
+ @register_benchmark(
+ BenchmarkMeta(
+ name='bfcl_v3',
+ pretty_name='BFCL-v3',
+ tags=[Tags.FUNCTION_CALLING],
+ description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
+ 'and executable function call evaluation** '
+ 'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
+ 'functions. Unlike previous evaluations, '
+ 'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+ 'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
+ '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',
+ dataset_id='AI-ModelScope/bfcl_v3',
+ subset_list=list(SUBJECT_MAPPING.keys()),
+ metric_list=['acc'],
+ eval_split='train',
+ extra_params={
+ 'underscore_to_dot': True,
+ 'is_fc_model': True,
+ }
+ )
+ )
+ class BFCLAdapter(DefaultDataAdapter):
+ """
+ BFCL adapter using the new data processing framework.
+ """

  def __init__(self, **kwargs):
  super().__init__(**kwargs)

- spec = importlib.util.find_spec('bfcl_eval')
- if spec is None:
- raise ImportError(
- '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+ check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)

  self.category_map = SUBJECT_MAPPING
+ self.reformat_subset = True

- extra_params = kwargs.get('extra_params', {})
- self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
- self.is_fc_model = extra_params.get('is_fc_model', True)
-
- def load(self, **kwargs):
- kwargs['subset_list'] = ['default']
- data_dict = super().load(**kwargs)
- return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+ self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+ self.is_fc_model = self.extra_params.get('is_fc_model', True)

  def preprocess_row(self, row: dict):
  """
@@ -87,151 +88,167 @@ class BFCLAdapter(DataAdapter):
  row['initial_config'] = json.loads(row['initial_config'])
  row['is_fc_model'] = self.is_fc_model

- def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
- self.preprocess_row(input_d)
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """Convert a data record to a Sample object."""
+ self.preprocess_row(record)

  # If the model is a function calling model, we need to remove the system prompt
  if self.is_fc_model:
- turns = input_d['turns']
+ turns = record['turns']
  new_turns = []
  for turn_idx, messages in enumerate(turns):
  current_messages = messages.copy()
  if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
  current_messages = current_messages[1:]
  new_turns.append(current_messages)
- input_d['turns'] = new_turns
-
- return self.gen_prompt_data(prompt='', messages=input_d)
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Get the gold choice
- return input_d.get('ground_truth', )
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
- row = copy.deepcopy(raw_input_d)
- del row['turns'] # Remove turns as they are not needed for the match function
-
- row['generation'] = result
- return row
-
- def match(self, gold: dict, pred: dict) -> dict:
+ record['turns'] = new_turns
+
+ return Sample(
+ input=[ChatMessageUser(content='')],
+ target='', # Will use the record for evaluation
+ subset_key=record['subset'],
+ metadata=record # Store the full record for evaluation
+ )
+
+ def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+ from .generation import predict
+ return predict(model, sample)
+
+ def match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+ ) -> Score:
  from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
  from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
- from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
- default_decode_execute_prompting)
+ from bfcl_eval.model_handler.utils import (
+ convert_to_function_call,
+ default_decode_ast_prompting,
+ default_decode_execute_prompting,
+ )
  from bfcl_eval.utils import is_empty_output

- # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
- # which decides if model was provided with functions of the type
- # spotify.list_songs or spotify_list_songs
- # It is False for all llama models (when using via prompting)
- # and True for API calls
- if self.underscore_to_dot:
- dummy_model = 'gpt-4o-2024-11-20-FC'
- else:
- dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
-
- row = pred
- test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
- if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
- error = None
- try:
- if self.is_fc_model:
- decoded_tool_calls = []
- for tool_call in row['generation'][0]:
- name = list(tool_call.keys())[0]
- params = json.loads(tool_call[name])
- decoded_tool_calls.append({name: params})
- else:
- decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
-
- # successful decode means valid function call was present
- contains_func_call = True
- if is_empty_output(decoded_tool_calls):
- # Empty output is not considered as a valid function call
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ try:
+ # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+ if self.underscore_to_dot:
+ dummy_model = 'gpt-4o-2024-11-20-FC'
+ else:
+ dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+ row = task_state.metadata
+ test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+
+ if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+ error = None
+ try:
+ if self.is_fc_model:
+ decoded_tool_calls = []
+ for tool_call in row['generation'][0]:
+ name = list(tool_call.keys())[0]
+ params = tool_call[name]
+ decoded_tool_calls.append({name: params})
+ else:
+ decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+ # successful decode means valid function call was present
+ contains_func_call = True
+ if is_empty_output(decoded_tool_calls):
+ # Empty output is not considered as a valid function call
+ contains_func_call = False
+ error = 'Empty decoded output.'
+ except Exception:
  contains_func_call = False
- error = 'Empty decoded output.'
- except Exception:
- contains_func_call = False
- error = f'Failed to decode with traceback: {traceback.format_exc()}'
- finally:
- valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
- score_result = {'valid': valid, 'error_message': error}
-
- elif row['multi_turn']:
- # each step might give a list of tool calls and each turn is multi-step
- # and multi-turn has generations of all the turns
- # hence in a multi-turn setting,
- # multi_turn_decoded_generations is a list of list of list of strings
- multi_turn_decoded_generations: list[list[list[str]]] = []
- for single_turn_generations in row['generation']:
- single_turn_decoded_generations: list[list[str]] = []
- for generation in single_turn_generations:
- try:
- if self.is_fc_model:
- tool_calls = convert_to_function_call(generation)
- else:
- tool_calls = default_decode_execute_prompting(generation)
-
- single_turn_decoded_generations.append(tool_calls)
- except Exception:
- single_turn_decoded_generations.append([generation])
-
- multi_turn_decoded_generations.append(single_turn_decoded_generations)
-
- try:
- raw_score_result = multi_turn_checker(
- multi_turn_decoded_generations,
- row['ground_truth'],
- row,
- test_category,
- dummy_model,
- )
- except Exception:
- raw_score_result = {
- 'valid': False,
- 'error_type': 'multi_turn:checker_failed',
- 'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
- }
+ error = f'Failed to decode with traceback: {traceback.format_exc()}'
+ finally:
+ valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+ score_result = {'valid': valid, 'error_message': error}
+
+ elif row['multi_turn']:
+ # each step might give a list of tool calls and each turn is multi-step
+ # and multi-turn has generations of all the turns
+ # hence in a multi-turn setting,
+ # multi_turn_decoded_generations is a list of list of list of strings
+ multi_turn_decoded_generations: list[list[list[str]]] = []
+ for single_turn_generations in row['generation']:
+ single_turn_decoded_generations: list[list[str]] = []
+ for generation in single_turn_generations:
+ try:
+ if self.is_fc_model:
+ tool_calls = convert_to_function_call(generation)
+ else:
+ tool_calls = default_decode_execute_prompting(generation)
+
+ single_turn_decoded_generations.append(tool_calls)
+ except Exception:
+ single_turn_decoded_generations.append([generation])
+
+ multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+ try:
+ raw_score_result = multi_turn_checker(
+ multi_turn_decoded_generations,
+ row['ground_truth'],
+ row,
+ test_category,
+ dummy_model,
+ )
+ except Exception:
+ raw_score_result = {
+ 'valid': False,
+ 'error_type': 'multi_turn:checker_failed',
+ 'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+ }

- score_result = {
- 'valid': float(raw_score_result['valid']),
- 'error_message': raw_score_result.get('error_message', ''),
- 'error_type': raw_score_result.get('error_type', ''),
- }
- else:
- try:
- if self.is_fc_model:
- decoded_tool_calls = []
- for tool_call in row['generation'][0]:
- name = list(tool_call.keys())[0]
- params = json.loads(tool_call[name])
- decoded_tool_calls.append({name: params})
- else:
- decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
-
- score_result = ast_checker(
- row['functions'],
- decoded_tool_calls,
- row['ground_truth'],
- row['language'],
- row['test_category'],
- dummy_model,
- )
- except Exception:
  score_result = {
- 'valid': False,
- 'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
- 'error_type': 'ast_decoder:decoder_failed',
+ 'valid': float(raw_score_result['valid']),
+ 'error_message': raw_score_result.get('error_message', ''),
+ 'error_type': raw_score_result.get('error_type', ''),
  }
-
- return {
- 'AverageAccuracy': float(score_result['valid']),
- 'raw_score_result': score_result,
- }
-
- def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
- # aggregate review results
- res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
- return super().compute_metric(res_dict, **kwargs)
+ else:
+ try:
+ if self.is_fc_model:
+ decoded_tool_calls = []
+ for tool_call in row['generation'][0]:
+ name = list(tool_call.keys())[0]
+ params = tool_call[name]
+ decoded_tool_calls.append({name: params})
+ else:
+ decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+ score_result = ast_checker(
+ row['functions'],
+ decoded_tool_calls,
+ row['ground_truth'],
+ row['language'],
+ row['test_category'],
+ dummy_model,
+ )
+ except Exception:
+ score_result = {
+ 'valid': False,
+ 'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+ 'error_type': 'ast_decoder:decoder_failed',
+ }
+
+ score.value = {
+ 'acc': float(score_result['valid']),
+ }
+ score.explanation = score_result.get('error_message', 'Evaluation completed')
+ score.metadata = {
+ 'raw_score_result': score_result,
+ 'test_category': test_category,
+ 'underscore_to_dot': self.underscore_to_dot,
+ 'is_fc_model': self.is_fc_model
+ }
+ score.main_score_name = 'acc'
+
+ except Exception:
+ logger.error(f'Evaluation failed for sample: {task_state.sample_id}\n{traceback.format_exc()}')
+ score.value = {'acc': 0.0}
+ score.explanation = 'Evaluation failed with an unexpected error.'
+ score.metadata = {'error': traceback.format_exc()}
+ score.main_score_name = 'acc'
+ return score
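
Where the old adapters split grading across match and compute_metric, the bfcl diff above reports results through a single match_score hook that returns a Score. The sketch below shows the simplest possible case, a plain exact-match comparison; the mixin class is hypothetical, and the Score/TaskState field names are taken from the diff rather than from the released API reference.

# Minimal sketch of the new match_score hook for a plain exact-match benchmark,
# following the Score fields used in the bfcl diff above. The class is
# hypothetical; Score/TaskState signatures are assumed from the diff.
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score


class ExactMatchAdapterMixin:

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        correct = filtered_prediction.strip() == reference.strip()
        score.value = {'acc': float(correct)}  # 'acc' mirrors the metric_list entries above
        score.main_score_name = 'acc'
        score.explanation = 'exact match' if correct else 'prediction differs from reference'
        score.metadata = {'reference': reference}
        return score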