evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/report/__init__.py
@@ -1,5 +1,48 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
 
-from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
-from evalscope.report.generator import ReportGenerator
-from evalscope.report.utils import Category, Report, ReportKey, Subset
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        percentage_weighted_average_from_subsets,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
+    from .generator import ReportGenerator
+    from .report import Category, Metric, Report, ReportKey, Subset
+
+else:
+    _import_structure = {
+        'combinator': [
+            'gen_table',
+            'get_data_frame',
+            'get_report_list',
+            'weighted_average_from_subsets',
+            'unweighted_average_from_subsets',
+            'percentage_weighted_average_from_subsets',
+        ],
+        'generator': [
+            'ReportGenerator',
+        ],
+        'report': [
+            'Category',
+            'Report',
+            'ReportKey',
+            'Subset',
+            'Metric',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
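The rewritten `__init__.py` defers all submodule imports: under `TYPE_CHECKING` the names are imported normally for type checkers, while at runtime the module replaces itself in `sys.modules` with a `_LazyModule` driven by `_import_structure`, so `combinator`, `generator`, and `report` are only loaded when one of their exported names is first accessed. A minimal sketch of that idea, assuming the common `__getattr__`-based lazy-module idiom rather than evalscope's actual `_LazyModule` internals:

# Illustrative only: a tiny lazy module mirroring the _import_structure idea.
import importlib
import types


class LazyModuleSketch(types.ModuleType):

    def __init__(self, name, import_structure):
        super().__init__(name)
        # map each exported attribute back to the submodule that defines it
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        submodule = importlib.import_module(f'.{self._attr_to_module[attr]}', self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value

# usage inside a package __init__.py (sketch):
# sys.modules[__name__] = LazyModuleSketch(__name__, _import_structure)

The practical effect is that `import evalscope.report` stays cheap, and heavier dependencies pulled in by the report submodules are only imported on first use.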
evalscope/report/combinator.py
@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.utils import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -32,42 +32,153 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:
     return report_list
 
 
-def get_data_frame(report_list: List[Report],
-                   flatten_metrics: bool = True,
-                   flatten_categories: bool = True) -> pd.DataFrame:
+def get_data_frame(
+    report_list: List[Report],
+    flatten_metrics: bool = True,
+    flatten_categories: bool = True,
+    add_overall_metric: bool = False
+) -> pd.DataFrame:
     tables = []
     for report in report_list:
-        df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+        df = report.to_dataframe(
+            flatten_metrics=flatten_metrics,
+            flatten_categories=flatten_categories,
+            add_overall_metric=add_overall_metric
+        )
         tables.append(df)
     return pd.concat(tables, ignore_index=True)
 
 
-def gen_table(reports_path_list: list) -> str:
-    report_list = get_report_list(reports_path_list)
-    table = get_data_frame(report_list)
+def gen_table(
+    reports_path_list: list[str] = None,
+    report_list: list[Report] = None,
+    flatten_metrics: bool = True,
+    flatten_categories: bool = True,
+    add_overall_metric: bool = False
+) -> str:
+    """
+    Generates a formatted table from a list of report paths or Report objects.
+
+    Args:
+        reports_path_list (list[str], optional): List of file paths to report files.
+            Either this or `report_list` must be provided.
+        report_list (list[Report], optional): List of Report objects.
+            Either this or `reports_path_list` must be provided.
+        flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+        flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+        add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+    Returns:
+        str: A string representation of the table in grid format.
+
+    Raises:
+        AssertionError: If neither `reports_path_list` nor `report_list` is provided.
+    """
+    assert (reports_path_list is not None) or (report_list is not None), \
+        'Either reports_path_list or report_list must be provided.'
+    if report_list is None:
+        report_list = get_report_list(reports_path_list)
+    # Generate a DataFrame from the report list
+    table = get_data_frame(
+        report_list,
+        flatten_metrics=flatten_metrics,
+        flatten_categories=flatten_categories,
+        add_overall_metric=add_overall_metric
+    )
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-class ReportsRecorder:
-    COMMON_DATASET_PATH = []
-    CUSTOM_DATASET_PATH = []
-
-    def __init__(self, oss_url: str = '', endpoint: str = ''):
-        pass
-
-
-if __name__ == '__main__':
-    report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
-    # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
-
-    report_table = gen_table([report_dir_1])
-    print(report_table)
-
-    # ALL VALUES ONLY FOR EXAMPLE
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
+
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
+
+
+def percentage_weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], weights: List[float], new_name: str = ''
+) -> Subset:
+    """Calculate percentage weighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        weights (List[float]): The weight for each corresponding accuracy entry.
+            Can sum to any positive value – they will be normalised internally.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with percentage weighted average score.
+    """
+    assert len(subset_names) == len(weights), \
+        'The number of subset names must match the number of weights.'
+
+    valid_subsets = []
+    valid_weights = []
+    total_count = 0
+
+    for name, weight in zip(subset_names, weights):
+        if name in subset_dict:
+            subset = subset_dict[name]
+            valid_subsets.append(subset)
+            valid_weights.append(weight)
+            total_count += subset.num
+
+    if not valid_subsets:
+        return Subset(name=new_name, score=0, num=0)
+
+    weight_sum = sum(valid_weights)
+    assert weight_sum > 0, \
+        f"Sum of weights for percentage_weighted_average_from_subsets for '{new_name}' is not positive."
+
+    # Normalise weights so that they sum to 1.0
+    weights_norm = [w / weight_sum for w in valid_weights]
+
+    total_score = 0
+    for subset, weight in zip(valid_subsets, weights_norm):
+        total_score += subset.score * weight
+
+    return Subset(name=new_name, score=total_score, num=total_count)
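The three new `*_average_from_subsets` helpers aggregate per-subset scores three ways: sample-weighted (micro), plain mean (macro), and an explicitly weighted mean with internally normalised weights. A quick usage sketch with invented numbers, assuming the lazy exports in `__init__.py` resolve as listed:

# Hypothetical values for illustration only.
from evalscope.report import Subset, unweighted_average_from_subsets, weighted_average_from_subsets

subset_dict = {
    'math': Subset(name='math', score=0.50, num=100),
    'code': Subset(name='code', score=0.80, num=300),
}
micro = weighted_average_from_subsets(['math', 'code'], subset_dict, new_name='overall-micro')
macro = unweighted_average_from_subsets(['math', 'code'], subset_dict, new_name='overall-macro')
print(micro.score)  # (0.50*100 + 0.80*300) / 400 = 0.725
print(macro.score)  # (0.50 + 0.80) / 2 = 0.65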
evalscope/report/generator.py
@@ -1,24 +1,67 @@
 import pandas as pd
 from pandas import DataFrame
+from typing import TYPE_CHECKING
 
 from evalscope.constants import DataCollection
-from evalscope.report.utils import *
+from evalscope.report.report import *
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import DataAdapter
+    from evalscope.api.metric import AggScore
 
 
 class ReportGenerator:
 
     @staticmethod
-    def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+        metrics_list = []
+        for metric_name, group_metric in df.groupby('metric', sort=False):
+            categories = []
+            for category_name, group_category in group_metric.groupby('categories'):
+                subsets = []
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                          'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+                categories.append(Category(name=category_name, subsets=subsets))
+            metrics_list.append(Metric(name=metric_name, categories=categories))
+        return Report(
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
+        )
+
+    @staticmethod
+    def generate_report(
+        score_dict: Dict[str, List['AggScore']],
+        model_name: str,
+        data_adapter: 'DataAdapter',
+        add_aggregation_name: bool = True
+    ) -> Report:
         """
-        Generate report for specific dataset.
-        subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
-        category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
-        metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+        Generate a report for a specific dataset based on provided subset scores.
+
+        Args:
+            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                ```
+                {
+                    'subset_name': [
+                        AggScore={'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                        AggScore={'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                    ],
+                    ...
+                }
+                ```
+            data_adapter (DataAdapter): An adapter object for data handling.
+
+        Returns:
+            Report: A structured report object containing metrics, categories, and subsets.
+
+        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
         """  # noqa: E501
 
-        dataset_name = kwargs.get('dataset_name', None)
-        model_name = kwargs.get('model_name', None)
-        category_map = kwargs.get('category_map', {})
+        dataset_name = data_adapter.name
+        category_map = data_adapter.category_map
+        report_name = f'{model_name}@{dataset_name}'
 
         def flatten_subset() -> DataFrame:
             """
@@ -30,25 +73,32 @@ class ReportGenerator:
            1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
            """
            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
+            for subset_name, agg_scores in score_dict.items():
+                for agg_score_item in agg_scores:
                    categories = category_map.get(subset_name, ['default'])
+                    if add_aggregation_name and agg_score_item.aggregation_name:
+                        metric_name = f'{agg_score_item.aggregation_name}_{agg_score_item.metric_name}'
+                    else:
+                        metric_name = agg_score_item.metric_name
+
                    if isinstance(categories, str):
                        categories = [categories]
                    subsets.append(
                        dict(
                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)))
+                            score=agg_score_item.score,
+                            num=agg_score_item.num,
+                            metric_name=metric_name,
+                            categories=tuple(categories)
+                        )
+                    )
            df = pd.DataFrame(subsets)
            return df
 
        df = flatten_subset()
 
        metrics_list = []
-        for metric_name, group_metric in df.groupby('metric_name'):
+        for metric_name, group_metric in df.groupby('metric_name', sort=False):
            categories = []
            for category_name, group_category in group_metric.groupby('categories'):
                subsets = []
@@ -59,22 +109,12 @@ class ReportGenerator:
 
            metrics_list.append(Metric(name=metric_name, categories=categories))
 
-        report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+        report = Report(
+            name=report_name,
+            metrics=metrics_list,
+            dataset_name=dataset_name,
+            model_name=model_name,
+            dataset_description=data_adapter.description,
+            dataset_pretty_name=data_adapter.pretty_name
+        )
        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
-        return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name)
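The old `gen_report(subset_score_map, report_name, **kwargs)` is replaced by `generate_report(score_dict, model_name, data_adapter, ...)`, which reads the dataset name, category map, description, and pretty name from the data adapter and optionally prefixes metric names with their aggregation. A hedged sketch of the new call, using `SimpleNamespace` stand-ins that only mimic the attributes the method reads; the real arguments are `AggScore` objects and a registered `DataAdapter`:

# Sketch only: the stand-ins expose just the fields generate_report accesses.
from types import SimpleNamespace

from evalscope.report import ReportGenerator

adapter = SimpleNamespace(
    name='arc', category_map={}, description='AI2 ARC benchmark', pretty_name='ARC-Challenge'
)
score_dict = {
    'ARC-Challenge': [SimpleNamespace(metric_name='acc', aggregation_name='mean', score=0.5, num=2)],
}
report = ReportGenerator.generate_report(score_dict, model_name='my-model', data_adapter=adapter)
print(report.name)   # 'my-model@arc'
print(report.score)  # 0.5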
evalscope/report/report.py
@@ -0,0 +1,238 @@
+import json
+import os
+import pandas as pd
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from typing import Any, Dict, List, Union
+
+from evalscope.metrics import macro_mean, micro_mean
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+3. 只列出报告本身,不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
+    """
+    Normalize score.
+
+    Args:
+        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
+        keep_num: number of digits to keep.
+
+    Returns:
+        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
+    """
+    if isinstance(score, float):
+        score = round(score, keep_num)
+    elif isinstance(score, dict):
+        score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
+    else:
+        logger.warning(f'Unknown score type: {type(score)}')
+    return score
+
+
+@dataclass
+class Subset:
+    name: str = 'default_subset'
+    score: float = 0.0
+    num: int = 0
+
+    def __post_init__(self):
+        self.score = normalize_score(self.score)
+
+
+@dataclass
+class Category:
+    name: tuple[str] = field(default_factory=tuple)
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    subsets: List[Subset] = field(default_factory=list)
+
+    def __post_init__(self):
+        if isinstance(self.name, str):
+            # ensure name is tuple format
+            self.name = (self.name, )
+        self.num = sum(subset.num for subset in self.subsets)
+        self.score = normalize_score(micro_mean(self.subsets))
+        self.macro_score = normalize_score(macro_mean(self.subsets))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+        return cls(name=data['name'], subsets=subsets)
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    categories: List[Category] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.num = sum(category.num for category in self.categories)
+        self.score = normalize_score(micro_mean(self.categories))
+        self.macro_score = normalize_score(macro_mean(self.categories))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        categories = [Category.from_dict(category) for category in data.get('categories', [])]
+        return cls(name=data['name'], categories=categories)
+
+
+class ReportKey:
+    model_name = 'Model'
+    dataset_name = 'Dataset'
+    metric_name = 'Metric'
+    category_name = 'Category'
+    category_prefix = 'Cat.'
+    subset_name = 'Subset'
+    num = 'Num'
+    score = 'Score'
+    overall_score = 'OVERALL'
+
+
+@dataclass
+class Report:
+    name: str = 'default_report'
+    dataset_name: str = 'default_dataset'
+    dataset_pretty_name: str = ''
+    dataset_description: str = ''
+    model_name: str = 'default_model'
+    score: float = 0.0
+    metrics: List[Metric] = field(default_factory=list)
+    analysis: str = 'N/A'
+
+    def __post_init__(self):
+        self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    def to_json_str(self) -> str:
+        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+    def to_json(self, json_file: str):
+        # ensure the directory exists
+        os.makedirs(os.path.dirname(json_file), exist_ok=True)
+        # write the report to a json file
+        with open(json_file, 'w', encoding='utf-8') as f:
+            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+        return cls(
+            name=data['name'],
+            dataset_name=data['dataset_name'],
+            dataset_pretty_name=data.get('dataset_pretty_name'),
+            dataset_description=data.get('dataset_description'),
+            score=data['score'],
+            model_name=data['model_name'],
+            metrics=metrics,
+            analysis=data.get('analysis', 'N/A'),
+        )
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        with open(json_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def to_dataframe(
+        self,
+        flatten_metrics: bool = True,
+        flatten_categories: bool = True,
+        add_overall_metric: bool = False
+    ) -> pd.DataFrame:
+        """
+        Convert the report to a pandas DataFrame.
+        Args:
+            flatten_metrics (bool): Whether to flatten the metrics to a single row.
+            flatten_categories (bool): Whether to flatten the categories to multiple rows.
+            add_overall_metric (bool): Whether to add an overall metric row.
+        Returns:
+            pd.DataFrame: The report as a pandas DataFrame.
+        """
+        table = defaultdict(list)
+        for metric in self.metrics:
+            metric_count = 0
+            for category in metric.categories:
+                for subset in category.subsets:
+                    metric_count += 1
+                    table[ReportKey.model_name].append(self.model_name)
+                    table[ReportKey.dataset_name].append(self.dataset_name)
+                    table[ReportKey.metric_name].append(metric.name)
+                    table[ReportKey.category_name].append(category.name)
+                    table[ReportKey.subset_name].append(subset.name)
+                    table[ReportKey.num].append(subset.num)
+                    table[ReportKey.score].append(subset.score)
+            # add overall metric when there are multiple subsets
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
+                table[ReportKey.model_name].append(self.model_name)
+                table[ReportKey.dataset_name].append(self.dataset_name)
+                table[ReportKey.metric_name].append(metric.name)
+                table[ReportKey.category_name].append(('-', ))
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
+                table[ReportKey.num].append(metric.num)
+                table[ReportKey.score].append(metric.score)
+            # NOTE: only flatten metrics if needed, use the first metric by default
+            if not flatten_metrics:
+                break
+        df = pd.DataFrame.from_dict(table, orient='columns')
+        if flatten_categories:
+            df = self._flatten_categories(df)
+        return df
+
+    def _flatten_categories(self, df: pd.DataFrame):
+        # expand categories to multiple rows
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories[ReportKey.category_name].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[
+                ReportKey.category_name].apply(lambda x: x[level] if len(x) > level else None)
+
+        df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+        return df_categories
+
+    def generate_analysis(self, judge_llm_config: dict) -> str:
+        import locale
+
+        from evalscope.metrics import LLMJudge
+
+        try:
+            # get the default locale
+            lang, _ = locale.getlocale()
+
+            if lang is None:
+                language = '中文'
+            else:
+                language = 'en' if lang.startswith('en') else '中文'
+
+            prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+            judge_llm = LLMJudge(**judge_llm_config)
+            response = judge_llm(prompt)
+        except Exception as e:
+            logger.error(f'Error generating analysis: {e}')
+            response = 'N/A'
+
+        self.analysis = response
+        return response
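The new `report.py` replaces the old `report/utils.py`: `Subset`, `Category`, `Metric`, and `Report` compute micro/macro scores in `__post_init__`, round them via `normalize_score`, and round-trip through JSON and pandas. A small end-to-end sketch with invented names and scores, relying only on the dataclasses shown above:

# Invented example data for illustration.
from evalscope.report import Category, Metric, Report, Subset

subsets = [
    Subset(name='high_school_physics', score=0.61, num=151),
    Subset(name='college_physics', score=0.47, num=102),
]
metric = Metric(name='mean_acc', categories=[Category(name='physics', subsets=subsets)])
report = Report(name='my-model@mmlu', dataset_name='mmlu', model_name='my-model', metrics=[metric])

print(report.score)                                   # micro average of the subsets, rounded to 4 digits
print(report.to_dataframe(add_overall_metric=True))   # one row per subset plus an OVERALL row
report.to_json('outputs/mmlu_report.json')            # creates the parent directory if needed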