evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
--- /dev/null
+++ b/evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py
@@ -0,0 +1,229 @@
+ import json
+ import os
+ import traceback
+ from copy import deepcopy
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+ from evalscope.api.dataset import Sample
+ from evalscope.api.dataset.dataset import DatasetDict
+ from evalscope.api.dataset.loader import DictDataLoader
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model, ModelOutput
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.report import Report
+ from evalscope.utils.function_utils import thread_safe
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+ from .utils import (
+     ALL_SCORING_CATEGORIES,
+     compute_aggregate_subsets,
+     compute_entry_result,
+     load_bfcl_data,
+     process_test_entries,
+     run_prereq_inference,
+ )
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='bfcl_v4',
+         pretty_name='BFCL-v4',
+         tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
+         description='With function calling being the building block of agents, '
+         'the Berkeley Function-Calling Leaderboard (BFCL) V4 presents a holistic agentic '
+         'evaluation for LLMs. BFCL V4 Agentic includes web search, memory, and format sensitivity. '
+         'Together, the ability to search the web, to read from and write to memory, and to invoke '
+         'functions in different languages provides the building blocks for the exciting and extremely '
+         'challenging avenues that power agentic LLMs today, from deep research to agents for coding and law. '
+         'Run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+         '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v4.html)',
+         dataset_id='https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard',
+         subset_list=ALL_SCORING_CATEGORIES,
+         metric_list=['acc'],
+         eval_split='train',
+         extra_params={
+             'underscore_to_dot': True,
+             'is_fc_model': True,
+             'SERPAPI_API_KEY': None,
+         }
+     )
+ )
+ class BFCLV4Adapter(AgentAdapter):
+     """
+     BFCL adapter using the new data processing framework.
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
+
+         self.add_overall_metric = False
+         self.add_aggregation_name = False
+
+         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+         self.is_fc_model = self.extra_params.get('is_fc_model', True)
+         # Set SERPAPI_API_KEY in the environment if provided
+         serpapi_api_key = self.extra_params.get('SERPAPI_API_KEY', None)
+         if serpapi_api_key:
+             os.environ['SERPAPI_API_KEY'] = serpapi_api_key
+         self.model_result_dir = Path(self._task_config.work_dir) if self._task_config else Path('./bfcl_model_results')
+         self.handler = None
+         self.prereq_entries = []
+         self.prereq_finished = False
+
+     def load(self):
+         """Load and process the BFCL dataset."""
+         from bfcl_eval.utils import parse_test_category_argument
+         datasets = {}
+         all_test_categories = parse_test_category_argument(self.subset_list)
+
+         test_entries_by_cat, ground_truth_by_cat = load_bfcl_data(all_test_categories)
+
+         for category in all_test_categories:
+             test_entries = test_entries_by_cat.get(category, [])
+             ground_truth_entries = ground_truth_by_cat.get(category, [])
+
+             if not test_entries:
+                 continue
+
+             datasets[category] = self._create_dataset_for_category(category, test_entries, ground_truth_entries)
+
+         test_dataset = DatasetDict(datasets)
+         return test_dataset, None
+
+     def _create_dataset_for_category(
+         self, category: str, test_entries: List[Dict], ground_truth_entries: List[Dict]
+     ) -> DatasetDict:
+         """Create a dataset for a single category by merging test and ground truth data."""
+         processed_entries, prereq_entries = process_test_entries(
+             category=category,
+             test_entries=test_entries,
+             ground_truth_entries=ground_truth_entries,
+             model_result_dir=self.model_result_dir,
+         )
+         # Collect prereq entries for later prereq inference
+         self.prereq_entries.extend(prereq_entries)
+
+         return DictDataLoader(
+             dict_list=processed_entries,
+             limit=self.limit,
+             repeats=self.repeats,
+             sample_fields=self.record_to_sample,
+             shuffle=self.shuffle,
+         ).load()
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         return Sample(
+             input=[ChatMessageUser(content=json.dumps(record['question']))],
+             target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
+             metadata=record  # Store the full record for evaluation
+         )
+
+     @thread_safe
+     def _init_handler(self):
+         if self.handler is not None:
+             return  # Handler already initialized
+
+         from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler
+
+         # Set env variables for the OpenAI API
+         os.environ['OPENAI_API_KEY'] = self._task_config.api_key
+         os.environ['OPENAI_BASE_URL'] = self._task_config.api_url
+
+         self.handler = OpenAICompletionsHandler(
+             model_name=self._task_config.model,
+             temperature=self._task_config.generation_config.temperature,
+             registry_name=self._task_config.model_id,
+             is_fc_model=self.is_fc_model,
+         )
+
+         self._prereq_inference()
+
+     def _prereq_inference(self):
+         if self.prereq_finished:
+             return
+         # MOVED: delegate prereq processing to utils
+         run_prereq_inference(
+             handler=self.handler,
+             prereq_entries=self.prereq_entries,
+             model_result_dir=self.model_result_dir,
+             batch_size=self._task_config.eval_batch_size,
+             logger=logger,
+         )
+         self.prereq_finished = True
+
+     def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+         try:
+             self._init_handler()
+
+             result, _ = self.handler.inference(
+                 deepcopy(sample.metadata), include_input_log=False, exclude_state_log=False
+             )
+
+             output = ModelOutput.from_content(
+                 model=model.name,
+                 content=json.dumps(result),
+             )
+         except Exception as e:
+             # This usually happens when the model gets stuck on one particular test case,
+             # e.g. a timeout error or an FC model returning an invalid JSON response.
+             # Since temperature is already set to 0.001, retrying the same test case will not help,
+             # so we continue the generation process and record the error message as the model response.
+             logger.error(f'Error during inference for sample ID {sample.metadata.get("id")}: {e}')
+             logger.error(traceback.format_exc())
+
+             output = ModelOutput.from_content(
+                 model=model.name,
+                 content=json.dumps({
+                     'error': str(e),
+                     'error_message': traceback.format_exc(),
+                 }),
+             )
+         return output
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         self._init_handler()
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         model_result = json.loads(filtered_prediction)
+         prompt = task_state.metadata
+
+         entry_result = compute_entry_result(
+             handler=self.handler,
+             model_result=model_result,
+             prompt_entry=prompt,
+             underscore_to_dot=self.underscore_to_dot,
+         )
+
+         valid = 1 if entry_result['valid'] else 0
+         score.value = {'acc': valid}
+         score.metadata = {
+             'valid': bool(entry_result.get('valid')),
+             'error': str(entry_result.get('error')),
+             'error_message': str(entry_result.get('error_message')),
+             'error_type': str(entry_result.get('error_type')),
+         }
+         return score
+
+     def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+         """
+         Finalize the report generation process. Calculate the overall score.
+         """
+
+         # noqa: E501
+         # MOVED: delegate aggregation logic to utils
+         compute_aggregate_subsets(report)
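
The adapter registers `bfcl_v4` with evalscope's benchmark registry, so the benchmark is reachable through the standard `TaskConfig`/`run_task` entry points. Below is a minimal usage sketch following the pattern in evalscope's documentation; the endpoint URL, API key, and model name are placeholders rather than values taken from this diff, and `extra_params` simply mirrors the defaults declared in the `BenchmarkMeta` above.

# Hypothetical invocation of the bfcl_v4 benchmark registered above.
# Endpoint, key, and model name are placeholders.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-fc-model',                 # placeholder model name
    api_url='http://127.0.0.1:8801/v1',  # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',                     # placeholder key
    eval_type='service',                 # evaluate a served model over the API
    datasets=['bfcl_v4'],                # name registered by @register_benchmark
    dataset_args={
        'bfcl_v4': {
            'extra_params': {
                'underscore_to_dot': True,  # defaults from BenchmarkMeta
                'is_fc_model': True,
                'SERPAPI_API_KEY': None,    # needed for the web-search categories
            },
        },
    },
)
run_task(task_cfg=task_cfg)

Note that `bfcl-eval==2025.10.27.1` must be installed first; the adapter's `check_import` call enforces this at construction time.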
@@ -0,0 +1,410 @@
1
+ from __future__ import annotations
2
+
3
+ import traceback
4
+ from collections import defaultdict
5
+ from copy import deepcopy
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ from typing import Any, Dict, List, Tuple
9
+
10
+ from evalscope.report import (
11
+ Category,
12
+ Report,
13
+ Subset,
14
+ percentage_weighted_average_from_subsets,
15
+ unweighted_average_from_subsets,
16
+ weighted_average_from_subsets,
17
+ )
18
+
19
+ # ----------------------------
20
+ # Public constants (extracted)
21
+ # ----------------------------
22
+
23
+ ALL_AVAILABLE_MEMORY_BACKENDS: List[str] = [
24
+ 'kv',
25
+ 'vector',
26
+ 'rec_sum',
27
+ ]
28
+
29
+ NON_LIVE_CATEGORY: List[str] = [
30
+ 'simple_python',
31
+ 'simple_java',
32
+ 'simple_javascript',
33
+ 'multiple',
34
+ 'parallel',
35
+ 'parallel_multiple',
36
+ 'irrelevance',
37
+ ]
38
+ LIVE_CATEGORY: List[str] = [
39
+ 'live_simple',
40
+ 'live_multiple',
41
+ 'live_parallel',
42
+ 'live_parallel_multiple',
43
+ 'live_irrelevance',
44
+ 'live_relevance',
45
+ ]
46
+ MULTI_TURN_CATEGORY: List[str] = [
47
+ 'multi_turn_base',
48
+ 'multi_turn_miss_func',
49
+ 'multi_turn_miss_param',
50
+ 'multi_turn_long_context',
51
+ ]
52
+ WEB_SEARCH_CATEGORY: List[str] = [
53
+ 'web_search_base',
54
+ 'web_search_no_snippet',
55
+ ]
56
+
57
+ MEMORY_CATEGORY: List[str] = [f'memory_{backend}' for backend in ALL_AVAILABLE_MEMORY_BACKENDS]
58
+ MEMORY_SCENARIO_NAME = [
59
+ 'student',
60
+ 'customer',
61
+ 'finance',
62
+ 'healthcare',
63
+ 'notetaker',
64
+ ]
65
+
66
+ SINGLE_TURN_CATEGORY: List[str] = NON_LIVE_CATEGORY + LIVE_CATEGORY
67
+ AGENTIC_CATEGORY: List[str] = MEMORY_CATEGORY + WEB_SEARCH_CATEGORY
68
+
69
+ ALL_SCORING_CATEGORIES: List[str] = SINGLE_TURN_CATEGORY + MULTI_TURN_CATEGORY + AGENTIC_CATEGORY
70
+
71
+ # Dummy models used only to infer underscore_to_dot behavior
72
+ DUMMY_MODEL_UNDERSCORE_TO_DOT = 'gpt-4o-2024-11-20-FC'
73
+ DUMMY_MODEL_NO_UNDERSCORE_TO_DOT = 'meta-llama/Llama-3.3-70B-Instruct-FC'
74
+
75
+ # ----------------------------
76
+ # Data preparation helpers
77
+ # ----------------------------
78
+
79
+
80
+ def load_bfcl_data(categories: List[str]) -> Tuple[Dict[str, List[Dict]], Dict[str, List[Dict]]]:
81
+ """
82
+ Load test entries and ground truth data from bfcl_eval for given categories.
83
+ """
84
+ from bfcl_eval.utils import is_relevance_or_irrelevance, load_dataset_entry, load_ground_truth_entry
85
+
86
+ test_entries_by_cat: Dict[str, List[Dict]] = defaultdict(list)
87
+ ground_truth_by_cat: Dict[str, List[Dict]] = defaultdict(list)
88
+
89
+ for category in categories:
90
+ test_entries_by_cat[category] = load_dataset_entry(
91
+ category, include_prereq=True, include_language_specific_hint=False
92
+ )
93
+ if not is_relevance_or_irrelevance(category):
94
+ ground_truth_by_cat[category] = load_ground_truth_entry(category)
95
+
96
+ return test_entries_by_cat, ground_truth_by_cat
97
+
98
+
99
+ def prepare_ground_truth_map(category: str, ground_truth_entries: List[Dict]) -> Dict[str, Dict]:
100
+ """
101
+ Map ground truth entries to IDs with category-specific adjustments.
102
+ """
103
+ from bfcl_eval.utils import is_memory, is_web_search
104
+
105
+ if not ground_truth_entries:
106
+ return {}
107
+
108
+ if is_memory(category):
109
+ return {entry['id'].replace('memory', category): entry for entry in ground_truth_entries}
110
+ if is_web_search(category):
111
+ return {entry['id'].replace('web_search', category): entry for entry in ground_truth_entries}
112
+ return {entry['id']: entry for entry in ground_truth_entries}
113
+
114
+
115
+ def process_test_entries(
116
+ category: str,
117
+ test_entries: List[Dict[str, Any]],
118
+ ground_truth_entries: List[Dict[str, Any]],
119
+ model_result_dir: Path,
120
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
121
+ """
122
+ Clean and enrich test entries, return processed entries and prereq entries.
123
+
124
+ Returns:
125
+ processed_entries: entries ready to be mapped to Samples
126
+ prereq_entries: entries requiring prereq inference (memory snapshots)
127
+ """
128
+ from bfcl_eval.utils import (
129
+ clean_up_memory_prereq_entries,
130
+ is_memory_prereq,
131
+ populate_initial_settings_for_memory_test_cases,
132
+ populate_initial_settings_for_web_search_test_cases,
133
+ )
134
+
135
+ ground_truth_map = prepare_ground_truth_map(category, ground_truth_entries)
136
+
137
+ test_entries = clean_up_memory_prereq_entries(test_entries)
138
+ test_entries = populate_initial_settings_for_web_search_test_cases(test_entries)
139
+ test_entries = populate_initial_settings_for_memory_test_cases(test_entries, model_result_dir=model_result_dir)
140
+
141
+ prereq_entries = [entry for entry in test_entries if is_memory_prereq(entry['id'])]
142
+ main_entries = [entry for entry in test_entries if not is_memory_prereq(entry['id'])]
143
+
144
+ processed_entries: List[Dict[str, Any]] = []
145
+ for entry in main_entries:
146
+ entry_id = entry['id']
147
+ entry['category'] = category
148
+ entry['ground_truth'] = ground_truth_map.get(entry_id, {}).get('ground_truth', {})
149
+ processed_entries.append(entry)
150
+
151
+ return processed_entries, prereq_entries
152
+
153
+
154
+ def run_prereq_inference(
155
+ handler: Any,
156
+ prereq_entries: List[Dict[str, Any]],
157
+ model_result_dir: Path,
158
+ batch_size: int,
159
+ logger: Any,
160
+ ) -> None:
161
+ """
162
+ Run prerequisite inferences for memory snapshot creation if results are missing.
163
+ Optimized to run different (backend, scenario) groups in parallel while preserving in-group order.
164
+ """
165
+ import re
166
+ from bfcl_eval.utils import get_directory_structure_by_id
167
+ from concurrent.futures import ThreadPoolExecutor, as_completed
168
+
169
+ if not prereq_entries:
170
+ return
171
+
172
+ def _parse_backend_scenario_idx(entry_id: str) -> Tuple[str, str, int]:
173
+ """
174
+ Extract backend, scenario, and scenario index from an entry id.
175
+ Expected format:
176
+ memory_{backend}_prereq_{total_index}-{scenario}-{scenario_index}
177
+ Returns ('unknown', 'unknown', 0) on failure.
178
+ """
179
+ backend = 'unknown'
180
+ scenario = 'unknown'
181
+ idx = 0
182
+
183
+ m_backend = re.search(r'^memory_(?P<backend>.+?)_prereq_', entry_id)
184
+ if m_backend:
185
+ backend = m_backend.group('backend')
186
+
187
+ m_tail = re.search(r'-(?P<scenario>[a-zA-Z_]+)-(?P<idx>\d+)$', entry_id)
188
+ if m_tail:
189
+ scenario = m_tail.group('scenario')
190
+ idx = int(m_tail.group('idx'))
191
+
192
+ return backend, scenario, idx
193
+
194
+ # Group entries by (backend, scenario)
195
+ groups: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
196
+ for entry in prereq_entries:
197
+ backend, scenario, idx = _parse_backend_scenario_idx(entry['id'])
198
+ entry['_group_backend'] = backend
199
+ entry['_group_scenario'] = scenario
200
+ entry['_scenario_idx'] = idx
201
+ groups.setdefault((backend, scenario), []).append(entry)
202
+
203
+ # Sort entries within each group by scenario index to keep order
204
+ for group_entries in groups.values():
205
+ group_entries.sort(key=lambda e: e.get('_scenario_idx', 0))
206
+
207
+ # Worker to process a single (backend, scenario) group sequentially
208
+ def _process_group_entries(group_entries: List[Dict[str, Any]], progress: Any) -> None:
209
+ for entry in group_entries:
210
+ try:
211
+ memory_snapshot_folder = (
212
+ model_result_dir / get_directory_structure_by_id(entry['id']) / 'memory_snapshot'
213
+ / 'prereq_checkpoints'
214
+ )
215
+ existing_filenames = {f.name for f in memory_snapshot_folder.rglob('*.json')}
216
+ if (entry['id'] + '.json') in existing_filenames:
217
+ logger.info(f'Skipping prereq inference for entry ID {entry["id"]} as result already exists.')
218
+ else:
219
+ handler.inference(deepcopy(entry), include_input_log=False, exclude_state_log=False)
220
+ except Exception as e:
221
+ logger.error(f'Error during prereq inference for entry ID {entry.get("id")}: {e}')
222
+ logger.error(traceback.format_exc())
223
+ finally:
224
+ # tqdm is thread-safe; each worker updates shared progress bar
225
+ progress.update(1)
226
+
227
+ # Run each (backend, scenario) group in parallel; preserve in-group order
228
+ total = len(prereq_entries)
229
+ with tqdm(total=total, desc='Running prereq inferences for memory snapshots...') as progress:
230
+ max_workers = min(batch_size, len(groups))
231
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
232
+ futures = [
233
+ executor.submit(_process_group_entries, group_entries, progress) for group_entries in groups.values()
234
+ ]
235
+ for _ in as_completed(futures):
236
+ # Errors are logged within workers
237
+ pass
238
+
+     # Clean up temporary grouping keys
+     for group_entries in groups.values():
+         for entry in group_entries:
+             entry.pop('_group_backend', None)
+             entry.pop('_group_scenario', None)
+             entry.pop('_scenario_idx', None)
+
+
+ # ----------------------------
+ # Scoring helpers
+ # ----------------------------
+
+
+ def compute_entry_result(
+     handler: Any,
+     model_result: Any,
+     prompt_entry: Dict[str, Any],
+     underscore_to_dot: bool,
+ ) -> Dict[str, Any]:
+     """
+     Compute evaluation result for a single entry across BFCL categories.
+     """
+     from bfcl_eval.constants.enums import Language, ReturnFormat
+     from bfcl_eval.eval_checker.eval_runner import (
+         _evaluate_single_agentic_entry,
+         _evaluate_single_ast_entry,
+         _evaluate_single_multi_turn_entry,
+         _evaluate_single_relevance_entry,
+     )
+     from bfcl_eval.utils import is_agentic, is_java, is_js, is_multi_turn, is_relevance_or_irrelevance
+
+     test_category = prompt_entry['category']
+     index = prompt_entry['id']
+     ground_truth = prompt_entry.get('ground_truth', {})
+
+     model_name = DUMMY_MODEL_UNDERSCORE_TO_DOT if underscore_to_dot else DUMMY_MODEL_NO_UNDERSCORE_TO_DOT
+
+     if is_relevance_or_irrelevance(test_category):
+         return _evaluate_single_relevance_entry(
+             handler=handler,
+             index=index,
+             model_result_item=model_result,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+
+     elif is_multi_turn(test_category):
+         return _evaluate_single_multi_turn_entry(
+             handler=handler,
+             test_entry_id=index,
+             model_result_list=model_result,
+             ground_truth_list=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+
+     elif is_agentic(test_category):
+         return _evaluate_single_agentic_entry(
+             handler=handler,
+             index=index,
+             model_result_list=model_result,
+             possible_answer_item=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+         )
+     else:
+         # AST categories (python/java/js)
+         if is_java(test_category):
+             language = Language.JAVA
+             return_format = ReturnFormat.JAVA
+         elif is_js(test_category):
+             language = Language.JAVASCRIPT
+             return_format = ReturnFormat.JAVASCRIPT
+         else:
+             language = Language.PYTHON
+             return_format = ReturnFormat.PYTHON
+
+         return _evaluate_single_ast_entry(
+             handler=handler,
+             index=index,
+             model_result_item=model_result,
+             possible_answer_item=ground_truth,
+             prompt_entry=prompt_entry,
+             model_name=model_name,
+             test_category=test_category,
+             language=language,
+             return_format=return_format,
+         )
+
+
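For quick reference, the dispatch above routes category families roughly as follows. The left-hand names are drawn from the subsets aggregated in the next section and the routing is approximate, not an exhaustive mapping:

```python
# Rough routing sketch; the is_* predicates come from bfcl_eval.utils and
# the evaluator names from bfcl_eval.eval_checker.eval_runner.
routing_examples = {
    'irrelevance': '_evaluate_single_relevance_entry',
    'multi_turn_base': '_evaluate_single_multi_turn_entry',
    'web_search_base': '_evaluate_single_agentic_entry',  # agentic family
    'simple_java': '_evaluate_single_ast_entry',  # AST, Language.JAVA
}
```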
+ # ----------------------------
+ # Report aggregation helpers
+ # ----------------------------
+
+
+ def compute_aggregate_subsets(report: Report) -> None:
+     """
+     Compute aggregated subsets and the overall score for a BFCL report.
+     Modifies the report in place.
+     """
+     for metric in report.metrics:
+         # Collect all subsets in a dictionary for easy access
+         subset_dict: Dict[str, Subset] = {}
+         for category in metric.categories:
+             for subset in category.subsets:
+                 subset_dict[subset.name] = subset
+
+         # Step 1: simple_ast (unweighted)
+         simple_subsets = ['simple_python', 'simple_java', 'simple_javascript']
+         simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+         subset_dict['simple_ast'] = simple_ast
+
+         # Step 2.1: non_live (simple_ast, multiple, parallel, parallel_multiple)
+         non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+         non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+         subset_dict['non_live'] = non_live
+
+         # Step 2.2: live (weighted)
+         live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+         live = weighted_average_from_subsets(live_subsets, subset_dict)
+         subset_dict['live'] = live
+
+         # Step 2.3: hallucination (unweighted)
+         hallucination_subsets = ['live_irrelevance', 'irrelevance']
+         hallucination = unweighted_average_from_subsets(hallucination_subsets, subset_dict)
+         subset_dict['hallucination'] = hallucination
+
+         # Step 2.4: multi_turn (unweighted)
+         multi_turn_subsets = [
+             'multi_turn_base',
+             'multi_turn_miss_func',
+             'multi_turn_miss_param',
+             'multi_turn_long_context',
+         ]
+         multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+         subset_dict['multi_turn'] = multi_turn
+
+         # Step 2.5: web_search (unweighted)
+         web_search_subsets = ['web_search_base', 'web_search_no_snippet']
+         web_search = unweighted_average_from_subsets(web_search_subsets, subset_dict)
+         subset_dict['web_search'] = web_search
+
+         # Step 2.6: memory (unweighted)
+         memory_subsets = ['memory_kv', 'memory_vector', 'memory_rec_sum']
+         memory = unweighted_average_from_subsets(memory_subsets, subset_dict)
+         subset_dict['memory'] = memory
+
+         # Step 2.7: agentic (unweighted)
+         agentic_subsets = ['web_search', 'memory']
+         agentic = unweighted_average_from_subsets(agentic_subsets, subset_dict)
+         subset_dict['agentic'] = agentic
+
+         # Step 3: overall (percentage-weighted average)
+         overall_subsets = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination']
+         overall = percentage_weighted_average_from_subsets(overall_subsets, subset_dict, weights=[40, 30, 10, 10, 10])
+         subset_dict['overall'] = overall
+
+         # Attach the computed aggregates to the metric as a synthetic category
+         computed_subset_names = ['agentic', 'multi_turn', 'non_live', 'live', 'hallucination', 'overall']
+         dummy_subsets: List[Subset] = []
+         for subset_name in computed_subset_names:
+             if subset_name in subset_dict and subset_dict[subset_name].num > 0:
+                 subset = subset_dict[subset_name]
+                 subset.name = subset_name.upper()
+                 dummy_subsets.append(subset)
+         dummy_category = Category(name='-', subsets=dummy_subsets)
+         metric.categories.append(dummy_category)
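To make the final weighting concrete, here is a small self-contained sketch of how the overall score combines the five top-level buckets. The averaging helpers used above are defined elsewhere in this module; the accuracies below are invented purely to illustrate the arithmetic:

```python
# Hypothetical bucket accuracies, invented for illustration only.
buckets = {'agentic': 0.55, 'multi_turn': 0.40, 'non_live': 0.88, 'live': 0.80, 'hallucination': 0.70}
weights = {'agentic': 40, 'multi_turn': 30, 'non_live': 10, 'live': 10, 'hallucination': 10}

# Percentage-weighted average, mirroring weights=[40, 30, 10, 10, 10] above.
overall = sum(buckets[k] * weights[k] for k in buckets) / sum(weights.values())
print(round(overall, 3))  # 0.578
```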
@@ -0,0 +1,36 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+ DESCRIPTION = (
+     'BiomixQA is a curated biomedical question-answering dataset. '
+     'It has been used to validate the Knowledge Graph based '
+     'Retrieval-Augmented Generation (KG-RAG) framework across different LLMs.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='biomix_qa',
+         pretty_name='BioMixQA',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.MEDICAL],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/biomix-qa',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
+ )
+ class BioMixQAAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
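As a quick sanity check, here is the mapping this adapter performs on a hypothetical record. The field values are invented; only the keys `question`, `choices`, and `answer` come from `record_to_sample` above, and whether `answer` holds the option text or a letter label depends on the upstream dataset:

```python
# Hypothetical record, shaped like the fields record_to_sample reads.
record = {
    'question': 'Which organ is primarily responsible for insulin production?',
    'choices': ['Liver', 'Pancreas', 'Kidney', 'Spleen'],
    'answer': 'Pancreas',
}
# record_to_sample(record) would then build:
# Sample(input=record['question'], choices=record['choices'],
#        target=record['answer'], metadata={})
```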