evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/bfcl/generation.py (new file)
@@ -0,0 +1,222 @@
+ import json
+ import time
+ from typing import Any
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+ from evalscope.api.tool.tool_info import ToolInfo
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def predict(model: Model, sample: Sample) -> ModelOutput:
+     """Main prediction function for BFCL using the new API framework."""
+     # Extract the row data from sample metadata
+     row = sample.metadata
+     is_fc_model = row.get('is_fc_model', False)
+
+     if is_fc_model:
+         response, model_usage = generate_turn_with_tools(model, row)
+     else:
+         response, model_usage = generate_turn(model, row)
+
+     sample.metadata['generation'] = response
+     # wrap response with openai types
+     return ModelOutput(
+         model=model.name,
+         choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+         model_usage=model_usage,
+         time=time.time()
+     )
+
+
+ def generate_turn(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import (
+         DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+         MAXIMUM_STEP_LIMIT,
+     )
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             new_turn = [{
+                 'role':
+                 'user',
+                 'content':
+                 DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                     functions=row['missing_functions'][str(turn_idx)]
+                 ),
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages
+             from evalscope.api.messages.chat_message import dict_to_chat_message
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+             # Get model response using generate method
+             model_output = model.generate(chat_messages)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 result = message
+             else:
+                 result = message.text
+
+             logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+             current_responses.append(result)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools:
+                 try:
+                     tool_calls = default_decode_execute_prompting(result)
+                 except Exception:
+                     tool_calls = None
+
+                 if tool_calls is None:
+                     break
+
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_calls,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+                 # Append tool outputs to the current messages
+                 tool_results = []
+                 for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                     tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                 current_messages.append({
+                     'role': 'user',
+                     'content': repr(tool_results),
+                 })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
+
+
+ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import convert_to_function_call
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+         tools = row['tools']
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             # inject new functions on the fly
+             new_tools = row['missing_functions'][str(turn_idx)]
+             for new_tool in new_tools:
+                 cur_tool = new_tool[0]
+                 # change type to object
+                 if cur_tool['parameters']['type'] != 'object':
+                     cur_tool['parameters']['type'] = 'object'
+                 tools.append({
+                     'type': 'function',
+                     'function': cur_tool,
+                 })
+             new_turn = [{
+                 'role': 'user',
+                 'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages with tools
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+             current_sample = Sample(
+                 input=chat_messages,
+                 target='',
+                 tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+             )
+
+             # Get model response
+             model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 model_responses = [message]
+                 tool_call_strs = None
+             elif message.tool_calls:
+                 model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                 try:
+                     tool_call_strs = convert_to_function_call(model_responses)
+                 except Exception as e:
+                     logger.error(f'Error converting tool calls to function call strings: {e}')
+                     tool_call_strs = None
+             else:
+                 model_responses = [message.text]
+                 tool_call_strs = None
+
+             current_responses.extend(model_responses)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools and tool_call_strs is not None:
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_call_strs,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+
+                 for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                     current_messages.append({
+                         'role': 'tool',
+                         'tool_call_id': tc.id,
+                         'content': json.dumps({'response': tool_output}),
+                     })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
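
The new generation.py drives BFCL's multi-turn loop: predict() reads the dataset row from Sample.metadata and dispatches to generate_turn (prompting-style models) or generate_turn_with_tools (native function-calling models), each of which keeps querying the model until no further tool calls are produced or MAXIMUM_STEP_LIMIT is exceeded. Below is a minimal sketch (not part of the release) of exercising this entry point with a duck-typed stub model; it assumes the Sample(metadata=...) keyword, requires the bfcl_eval package that the module imports, and the EchoModel class and row values are illustrative placeholders.

# Sketch only: drive `predict` with a stub model standing in for evalscope.api.model.Model.
from types import SimpleNamespace

from evalscope.api.dataset import Sample
from evalscope.api.model import ModelUsage
from evalscope.benchmarks.bfcl.generation import predict


class EchoModel:
    """Duck-typed stand-in for evalscope.api.model.Model (hypothetical, for illustration)."""
    name = 'echo-model'

    def generate(self, messages, tools=None):
        # generate_turn only reads `.message` and `.usage` from the returned object
        return SimpleNamespace(message='The flight has been booked.', usage=ModelUsage())


row = {
    'is_fc_model': False,                # prompting path -> generate_turn
    'turns': [[{'role': 'user', 'content': 'Book a flight to Beijing.'}]],
    'missing_functions': {},             # no held-out functions in this sketch
    'should_execute_tool_calls': False,  # skip real BFCL tool execution
}

output = predict(EchoModel(), Sample(input='', target='', metadata=row))
print(output.choices[0])  # JSON-encoded list of per-turn responses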
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -1,73 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import csv
- import os
- from collections import defaultdict
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
- from evalscope.utils.io_utils import csv_to_list
- from evalscope.utils.logger import get_logger

- # flake8: noqa
+ from typing import Any, Dict

- logger = get_logger()
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger

- SUBSET_LIST = [
-     'computer_network',
-     'operating_system',
-     'computer_architecture',
-     'college_programming',
-     'college_physics',
-     'college_chemistry',
-     'advanced_mathematics',
-     'probability_and_statistics',
-     'discrete_mathematics',
-     'electrical_engineer',
-     'metrology_engineer',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_chemistry',
-     'high_school_biology',
-     'middle_school_mathematics',
-     'middle_school_biology',
-     'middle_school_physics',
-     'middle_school_chemistry',
-     'veterinary_medicine',
-     'college_economics',
-     'business_administration',
-     'marxism',
-     'mao_zedong_thought',
-     'education_science',
-     'teacher_qualification',
-     'high_school_politics',
-     'high_school_geography',
-     'middle_school_politics',
-     'middle_school_geography',
-     'modern_chinese_history',
-     'ideological_and_moral_cultivation',
-     'logic',
-     'law',
-     'chinese_language_and_literature',
-     'art_studies',
-     'professional_tour_guide',
-     'legal_professional',
-     'high_school_chinese',
-     'high_school_history',
-     'middle_school_history',
-     'civil_servant',
-     'sports_science',
-     'plant_protection',
-     'basic_medicine',
-     'clinical_medicine',
-     'urban_and_rural_planner',
-     'accountant',
-     'fire_engineer',
-     'environmental_impact_assessment_engineer',
-     'tax_accountant',
-     'physician',
- ]
+ logger = get_logger()

  SUBJECT_MAPPING = {
      'computer_network': ['Computer Network', '计算机网络', 'STEM'],
@@ -124,115 +65,105 @@ SUBJECT_MAPPING = {
      'physician': ['Physician', '医师资格', 'Other']
  }

-
- @Benchmark.register(
-     name='ceval',
-     pretty_name='C-Eval',
-     tags=['Knowledge', 'MCQ', 'Chinese'],
-     description=
-     'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
-     dataset_id='modelscope/ceval-exam',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split='dev',
-     eval_split='val',
-     prompt_template=
-     '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
+ # Based on the prompt template for Chinese evaluation
+ USER_PROMPT_TEMPLATE = """以下是中国关于{subject}的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 A、B、C、D 中的一个。
+
+ 问题:{question}
+ 选项:
+ {choices}
+ """.lstrip()  # noqa: E501
+
+ FEWSHOT_TEMPLATE = """以下是一些示例问题:
+
+ {fewshot}
+
+ """.lstrip()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ceval',
+         pretty_name='C-Eval',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+         description=
+         'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
+         dataset_id='evalscope/ceval',
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         metric_list=['acc'],
+         few_shot_num=5,
+         train_split='dev',
+         eval_split='val',
+         prompt_template=USER_PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class CEVALAdapter(DataAdapter):
+ class CEVALAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):

-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num > 5:
-             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-             kwargs['few_shot_num'] = 5
          super().__init__(**kwargs)

          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the C-Eval:
-
-             {'id': 0,
-             'question': '下列关于税法基本原则的表述中,不正确的是____。',
-             'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-             'B': '税收公平原则源于法律上的平等性原则',
-             'C': '税收效率原则包含经济效率和行政效率两个方面',
-             'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
-             'answer': 'D',
-             'explanation': ''}
-
-         Returns:
-             {'data': ['prompt ...']}
-         """
-
-         few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         if len(few_shot_prompts) > 0:
-             context: str = '\n'.join(few_shot_prompts) + '\n'
-         else:
-             context = ''
-
-         query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('answer', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         # Build choices list from A, B, C, D fields
+         choices = [record['A'], record['B'], record['C'], record['D']]
+         subset = self.current_subset_name
+
+         return Sample(
+             input=record['question'],
+             choices=choices,
+             target=record['answer'],
+             metadata={
+                 'id': record.get('id', ''),
+                 'explanation': record.get('explanation', ''),
+                 'subject': subset
+             },
+         )
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         q_str = f"""问题:{sample.input}"""
+         choices = sample.choices if sample.choices is not None else []
+         opt_str_list = []
+         for i, choice in enumerate(choices):
+             opt_str_list.append(f"""{chr(65 + i)}. {choice}""")
+         opt_str = '\n'.join(opt_str_list)
+         opt_str = f"""选项:\n{opt_str}"""
+         exp_str = f"""解析:{sample.metadata.get('explanation', '')}"""
+         ans_str = f"""答案:{sample.target}"""
+         final_str = '\n'.join([q_str, opt_str, exp_str, ans_str])
+
+         return final_str
+
+     def format_fewshot_template(self, fewshot, sample):
+         fewshot_str = FEWSHOT_TEMPLATE.format(fewshot=fewshot)
+         prompt_str = self.format_prompt_template(sample)
+         return fewshot_str + '\n' + prompt_str
+
+     def format_prompt_template(self, sample):
+         subject_name = SUBJECT_MAPPING.get(sample.metadata['subject'])[1]
+         choices = sample.choices if sample.choices is not None else []
+         choices_str = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(choices)])
+
+         return USER_PROMPT_TEMPLATE.format(subject=subject_name, question=sample.input, choices=choices_str)
+
+     def extract_answer(self, prediction, task_state) -> str:
          """
-         Parse the model output to get the answer. Could be the best choice index.
+         Extract the answer from the prediction based on the task state.

          Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+             prediction (str): The model's prediction string
+             task_state (dict): The current task state containing metadata

          Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
+             str: The extracted answer from the prediction
          """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _format_example(self, input_d: dict, include_answer=True):
-         example = '问题:' + input_d['question']
-         for choice in self.choices:
-             example += f'\n{choice}. {input_d[f"{choice}"]}'
+         import re

-         if include_answer:
-             example += '\n答案: ' + input_d['answer'] + '\n\n'
+         # Use regex to find the answer in the format "答案:LETTER"
+         match = re.search(r'答案:([A-D])', prediction)
+         if match:
+             return match.group(1)
          else:
-             example += '\n答案: '
-         return example
+             logger.warning(f'No valid answer found in prediction: {prediction}')
+             return ''
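
With this rewrite, the C-Eval adapter no longer loads CSVs or parses options itself: it declares its metadata through BenchmarkMeta, inherits the shared multiple-choice behaviour from MultiChoiceAdapter, and only maps records to Samples, renders the Chinese prompt, and pulls the final "答案:LETTER" line out of the completion. Below is a hedged sketch of invoking the migrated benchmark through evalscope's task runner; run_task and TaskConfig are evalscope's documented entry points, while the model id, dataset_args keys and limit are placeholders assumed to carry over to 1.0.x.

# Sketch only: smoke-test the migrated 'ceval' benchmark end to end.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',   # placeholder model id
    datasets=['ceval'],                   # name registered via @register_benchmark above
    dataset_args={'ceval': {
        'subset_list': ['computer_network', 'logic'],  # two of the SUBJECT_MAPPING keys
        'few_shot_num': 0,                             # override the 5-shot default
    }},
    limit=5,                              # a handful of samples per subset
)

run_task(task_cfg=task)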