evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py
@@ -0,0 +1,370 @@
+ import json
+ import re
+ import traceback
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBJECT_MAPPING = {
+     'simple': 'AST_NON_LIVE',
+     'multiple': 'AST_NON_LIVE',
+     'parallel': 'AST_NON_LIVE',
+     'parallel_multiple': 'AST_NON_LIVE',
+     'java': 'AST_NON_LIVE',
+     'javascript': 'AST_NON_LIVE',
+     'live_simple': 'AST_LIVE',
+     'live_multiple': 'AST_LIVE',
+     'live_parallel': 'AST_LIVE',
+     'live_parallel_multiple': 'AST_LIVE',
+     'irrelevance': 'RELEVANCE',
+     'live_relevance': 'RELEVANCE',
+     'live_irrelevance': 'RELEVANCE',
+     'multi_turn_base': 'MULTI_TURN',
+     'multi_turn_miss_func': 'MULTI_TURN',
+     'multi_turn_miss_param': 'MULTI_TURN',
+     'multi_turn_long_context': 'MULTI_TURN'
+ }
+
+ BFCL_V3_TO_V4_SUBJECT_MAPPING = {
+     'simple': 'simple_python',
+     'java': 'simple_java',
+     'javascript': 'simple_javascript',
+ }
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='bfcl_v3',
+         pretty_name='BFCL-v3',
+         tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
+         description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
+         'and executable function call evaluation** '
+         'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
+         'functions. Unlike previous evaluations, '
+         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+         'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+         '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
+         dataset_id='AI-ModelScope/bfcl_v3',
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         metric_list=['acc'],
+         eval_split='train',
+         extra_params={
+             'underscore_to_dot': True,
+             'is_fc_model': True,
+         }
+     )
+ )
+ class BFCLV3Adapter(AgentAdapter):
+     """
+     BFCL adapter using the new data processing framework.
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
+
+         self.category_map = SUBJECT_MAPPING
+         self.reformat_subset = True
+         self.add_overall_metric = False
+         self.add_aggregation_name = False
+
+         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+         self.is_fc_model = self.extra_params.get('is_fc_model', True)
+
+     def preprocess_row(self, row: dict):
+         """
+         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
+         """
+         row['should_execute_tool_calls'] = True if row['multi_turn'] else False
+         row['functions'] = json.loads(row['functions'])
+         row['tools'] = json.loads(row['tools'])
+         row['turns'] = json.loads(row['turns'])
+         row['missing_functions'] = json.loads(row['missed_functions'])
+         row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
+         row['initial_config'] = json.loads(row['initial_config'])
+         row['is_fc_model'] = self.is_fc_model
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         self.preprocess_row(record)
+
+         # If the model is a function calling model, we need to remove the system prompt
+         if self.is_fc_model:
+             turns = record['turns']
+             new_turns = []
+             for turn_idx, messages in enumerate(turns):
+                 current_messages = messages.copy()
+                 if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
+                     current_messages = current_messages[1:]
+                 new_turns.append(current_messages)
+             record['turns'] = new_turns
+
+         return Sample(
+             input=[ChatMessageUser(content=json.dumps(record['turns']))],
+             target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
+             subset_key=record['subset'],
+             metadata=record  # Store the full record for evaluation
+         )
+
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         from .generation import predict
+         return predict(model, sample)
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
+         from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
+         from bfcl_eval.model_handler.utils import (
+             convert_to_function_call,
+             default_decode_ast_prompting,
+             default_decode_execute_prompting,
+         )
+         from bfcl_eval.utils import is_empty_output
+
+         from .utils import convert_format_language, convert_language
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         try:
+             # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+             if self.underscore_to_dot:
+                 dummy_model = 'gpt-4o-2024-11-20-FC'
+             else:
+                 dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+             row = task_state.metadata
+             test_category = BFCL_V3_TO_V4_SUBJECT_MAPPING.get(row['test_category'], row['test_category'])
+
+             if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+                 error = None
+                 try:
+                     if self.is_fc_model:
+                         decoded_tool_calls = []
+                         for tool_call in row['generation'][0]:
+                             name = list(tool_call.keys())[0]
+                             params = tool_call[name]
+                             decoded_tool_calls.append({name: params})
+                     else:
+                         decoded_tool_calls = default_decode_ast_prompting(
+                             row['generation'][0][0], convert_format_language(row['language'])
+                         )
+
+                     # successful decode means valid function call was present
+                     contains_func_call = True
+                     if is_empty_output(decoded_tool_calls):
+                         # Empty output is not considered as a valid function call
+                         contains_func_call = False
+                         error = 'Empty decoded output.'
+                 except Exception:
+                     contains_func_call = False
+                     error = f'Failed to decode with traceback: {traceback.format_exc()}'
+                 finally:
+                     valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+                     score_result = {'valid': valid, 'error_message': error}
+
+             elif row['multi_turn']:
+                 # each step might give a list of tool calls and each turn is multi-step
+                 # and multi-turn has generations of all the turns
+                 # hence in a multi-turn setting,
+                 # multi_turn_decoded_generations is a list of list of list of strings
+                 multi_turn_decoded_generations: list[list[list[str]]] = []
+                 for single_turn_generations in row['generation']:
+                     single_turn_decoded_generations: list[list[str]] = []
+                     for generation in single_turn_generations:
+                         try:
+                             if self.is_fc_model:
+                                 tool_calls = convert_to_function_call(generation)
+                             else:
+                                 tool_calls = default_decode_execute_prompting(generation)
+
+                             single_turn_decoded_generations.append(tool_calls)
+                         except Exception:
+                             single_turn_decoded_generations.append([generation])
+
+                     multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+                 try:
+                     raw_score_result = multi_turn_checker(
+                         multi_turn_decoded_generations,
+                         row['ground_truth'],
+                         row,
+                         test_category,
+                         dummy_model,
+                     )
+                 except Exception:
+                     raw_score_result = {
+                         'valid': False,
+                         'error_type': 'multi_turn:checker_failed',
+                         'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+                     }
+
+                 score_result = {
+                     'valid': float(raw_score_result['valid']),
+                     'error_message': raw_score_result.get('error_message', ''),
+                     'error_type': raw_score_result.get('error_type', ''),
+                 }
+             else:
+                 try:
+                     if self.is_fc_model:
+                         decoded_tool_calls = []
+                         for tool_call in row['generation'][0]:
+                             name = list(tool_call.keys())[0]
+                             params = tool_call[name]
+                             decoded_tool_calls.append({name: params})
+                     else:
+                         decoded_tool_calls = default_decode_ast_prompting(
+                             row['generation'][0][0], convert_format_language(row['language'])
+                         )
+
+                     score_result = ast_checker(
+                         row['functions'],
+                         decoded_tool_calls,
+                         row['ground_truth'],
+                         convert_language(row['language']),
+                         test_category,
+                         dummy_model,
+                     )
+                 except Exception:
+                     score_result = {
+                         'valid': False,
+                         'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+                         'error_type': 'ast_decoder:decoder_failed',
+                     }
+
+             score.value = {
+                 'acc': float(score_result['valid']),
+             }
+             score.explanation = score_result.get('error_message', 'Evaluation completed')
+             score.metadata = {
+                 'raw_score_result': score_result,
+                 'test_category': test_category,
+                 'underscore_to_dot': self.underscore_to_dot,
+                 'is_fc_model': self.is_fc_model
+             }
+             score.main_score_name = 'acc'
+
+         except Exception:
+             logger.error(f'Evaluation failed for sample: {task_state.sample_id}\n{traceback.format_exc()}')
+             score.value = {'acc': 0.0}
+             score.explanation = 'Evaluation failed with an unexpected error.'
+             score.metadata = {'error': traceback.format_exc()}
+             score.main_score_name = 'acc'
+         return score
+
+     def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+         """
+         Finalize the report generation process. Calculate the overall score.
+
+         Track the number of each category.
+         - step1: simple, java, javascript unweighted average as simple_ast
+         - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+         - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+         - step2.3: irrelevance as hallucination_non_live
+         - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+         - step2.5: multi_turn_base as multi_turn_base
+         - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+         - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+         - step3.2: ast_live, hallucination_live weighted average as live
+         - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+         - step4: non_live, live, multi_turn unweighted average as overall
+         Args:
+             report (Report): The generated evaluation report.
+             output_dir (str): The directory to save the report.
+
+         Returns:
+             None
+         """  # noqa: E501
+         for metric in report.metrics:
+             # Collect all subsets in a dictionary for easy access
+             subset_dict: Dict[str, Subset] = {}
+             for category in metric.categories:
+                 for subset in category.subsets:
+                     subset_dict[subset.name] = subset
+
+             # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+             simple_subsets = ['simple', 'java', 'javascript']
+             simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+             subset_dict['simple_ast'] = simple_ast
+
+             # Step 2.1: Calculate ast_non_live
+             # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+             ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+             ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+             subset_dict['ast_non_live'] = ast_non_live
+
+             # Step 2.2: Calculate ast_live
+             # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+             live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+             ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+             subset_dict['ast_live'] = ast_live
+
+             # Step 2.3: hallucination_non_live (irrelevance)
+             if 'irrelevance' in subset_dict:
+                 subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+             else:
+                 subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+             # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+             hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+             hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+             subset_dict['hallucination_live'] = hallucination_live
+
+             # Step 2.5: multi_turn_base
+             if 'multi_turn_base' not in subset_dict:
+                 subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+             # Step 2.6: Calculate multi_turn_augmented
+             # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+             multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+             multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+             subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+             # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+             non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+             non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+             subset_dict['non_live'] = non_live
+
+             # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+             live_agg_subsets = ['ast_live', 'hallucination_live']
+             live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+             subset_dict['live'] = live
+
+             # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+             multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+             multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+             subset_dict['multi_turn'] = multi_turn
+
+             # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+             overall_subsets = ['non_live', 'live', 'multi_turn']
+             overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+             subset_dict['overall'] = overall
+
+             # Add computed scores to the category
+             computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+             # Add the computed scores as new subsets in the metric
+             dummy_subsets = []
+             for subset_name in computed_subset_names:
+                 if subset_name in subset_dict:
+                     subset = subset_dict[subset_name]
+                     subset.name = subset_name.upper()
+                     dummy_subsets.append(subset)
+             dummy_category = Category(name='-', subsets=dummy_subsets)
+             metric.categories.append(dummy_category)

evalscope/benchmarks/bfcl/v3/generation.py
@@ -0,0 +1,222 @@
+ import json
+ import time
+ from typing import Any
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import dict_to_chat_message
+ from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+ from evalscope.api.tool.tool_info import ToolInfo
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def predict(model: Model, sample: Sample) -> ModelOutput:
+     """Main prediction function for BFCL using the new API framework."""
+     # Extract the row data from sample metadata
+     row = sample.metadata
+     is_fc_model = row.get('is_fc_model', False)
+
+     if is_fc_model:
+         response, model_usage = generate_turn_with_tools(model, row)
+     else:
+         response, model_usage = generate_turn(model, row)
+
+     sample.metadata['generation'] = response
+     # wrap response with openai types
+     return ModelOutput(
+         model=model.name,
+         choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+         model_usage=model_usage,
+         time=time.time()
+     )
+
+
+ def generate_turn(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import (
+         DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+         MAXIMUM_STEP_LIMIT,
+     )
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             new_turn = [{
+                 'role':
+                     'user',
+                 'content':
+                     DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                         functions=row['missing_functions'][str(turn_idx)]
+                     ),
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages
+             from evalscope.api.messages.chat_message import dict_to_chat_message
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+             # Get model response using generate method
+             model_output = model.generate(chat_messages)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             if model_output.usage is not None:
+                 model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 result = message
+             else:
+                 result = message.text
+
+             logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+             current_responses.append(result)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools:
+                 try:
+                     tool_calls = default_decode_execute_prompting(result)
+                 except Exception:
+                     tool_calls = None
+
+                 if tool_calls is None:
+                     break
+
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_calls,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+                 # Append tool outputs to the current messages
+                 tool_results = []
+                 for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                     tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                 current_messages.append({
+                     'role': 'user',
+                     'content': repr(tool_results),
+                 })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage
+
+
+ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+     from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+     from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+     from bfcl_eval.model_handler.utils import convert_to_function_call
+
+     all_model_responses = []
+     current_messages = []
+     turns = row['turns']
+     model_usage = ModelUsage()
+
+     for turn_idx, messages in enumerate(turns):
+         n_steps = 0
+         current_responses = []
+         current_messages += messages.copy()
+         tools = row['tools']
+
+         if str(turn_idx) in row['missing_functions']:
+             assert len(messages) == 0, 'Holdout turn should not have user message.'
+             # inject new functions on the fly
+             new_tools = row['missing_functions'][str(turn_idx)]
+             for new_tool in new_tools:
+                 cur_tool = new_tool[0]
+                 cur_tool['parameters']['type'] = 'object'
+                 tools.append({
+                     'type': 'function',
+                     'function': cur_tool,
+                 })
+             new_turn = [{
+                 'role': 'user',
+                 'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+             }]
+             current_messages += new_turn
+
+         while True:
+             # Create a sample for the current messages with tools
+             chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+             current_sample = Sample(
+                 input=chat_messages,
+                 target='',
+                 tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+             )
+
+             # Get model response
+             model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+             # Handle the response based on the model output structure
+             message = model_output.message
+             if model_output.usage is not None:
+                 model_usage += model_output.usage
+
+             current_messages.append(message)
+             if isinstance(message, str):
+                 model_responses = [message]
+                 tool_call_strs = None
+             elif message.tool_calls:
+                 model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                 try:
+                     tool_call_strs = convert_to_function_call(model_responses)
+                 except Exception as e:
+                     logger.error(f'Error converting tool calls to function call strings: {e}')
+                     tool_call_strs = None
+             else:
+                 model_responses = [message.text]
+                 tool_call_strs = None
+
+             current_responses.extend(model_responses)
+
+             execute_tools = row.get('should_execute_tool_calls', False)
+             if execute_tools and tool_call_strs is not None:
+                 tool_outputs, _ = execute_multi_turn_func_call(
+                     tool_call_strs,
+                     initial_config=row['initial_config'],
+                     involved_classes=row['involved_classes'],
+                     model_name='evaluator_loop',
+                     test_entry_id=row['id'],
+                     long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                     is_evaL_run=False,
+                 )
+
+                 for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                     current_messages.append({
+                         'role': 'tool',
+                         'tool_call_id': tc.id,
+                         'content': json.dumps({'response': tool_output}),
+                     })
+             else:
+                 break
+
+             n_steps += 1
+             if n_steps > MAXIMUM_STEP_LIMIT:
+                 logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                 break
+
+         all_model_responses.append(current_responses)
+
+     return all_model_responses, model_usage

evalscope/benchmarks/bfcl/v3/utils.py
@@ -0,0 +1,23 @@
+ def convert_language(language: str) -> str:
+     """Convert language names from BFCL v3 to BFCL v4 naming conventions."""
+     from bfcl_eval.constants.enums import Language
+     mapping = {
+         'python': Language.PYTHON,
+         'java': Language.JAVA,
+         'javascript': Language.JAVASCRIPT,
+     }
+     return mapping[language.lower()]
+
+
+ def convert_format_language(format_language: str) -> str:
+     """Convert format language names from BFCL v3 to BFCL v4 naming conventions."""
+     from bfcl_eval.constants.enums import ReturnFormat
+     mapping = {
+         'python': ReturnFormat.PYTHON,
+         'java': ReturnFormat.JAVA,
+         'javascript': ReturnFormat.JAVASCRIPT,
+         'json': ReturnFormat.JSON,
+         'verbose_xml': ReturnFormat.VERBOSE_XML,
+         'concise_xml': ReturnFormat.CONCISE_XML,
+     }
+     return mapping[format_language.lower()]
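
The three hunks above add the BFCL-v3 benchmark as a self-registering adapter (`bfcl_v3`), its multi-turn generation loop, and the v3-to-v4 name converters. Below is a minimal usage sketch, assuming evalscope's `TaskConfig`/`run_task` entry points; the model name, endpoint, API key, and `eval_type` value are placeholders and assumptions rather than values taken from this diff, and `bfcl-eval==2025.10.27.1` must be installed first, as the benchmark description notes.

# Hedged usage sketch; model name, endpoint, api_key, and eval_type below are assumptions.
# Requires: pip install bfcl-eval==2025.10.27.1 (per the benchmark description above).
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-function-calling-model',                      # placeholder model name
    api_url='http://localhost:8000/v1/chat/completions',    # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',                                        # placeholder key
    eval_type='service',                                    # assumed value for API-based evaluation
    datasets=['bfcl_v3'],
    dataset_args={
        'bfcl_v3': {
            # mirrors the defaults declared in BenchmarkMeta.extra_params
            'extra_params': {
                'underscore_to_dot': True,
                'is_fc_model': True,
            },
        },
    },
)
run_task(task_cfg=task_cfg)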