evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,195 @@
1
+ """
2
+ Data loading and processing utilities for the Evalscope dashboard.
3
+ """
4
+ import glob
5
+ import os
6
+ import pandas as pd
7
+ from typing import Any, Dict, List, Union
8
+
9
+ from evalscope.api.evaluator import CacheManager, ReviewResult
10
+ from evalscope.constants import DataCollection
11
+ from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
12
+ from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list, yaml_to_dict
13
+ from evalscope.utils.logger import get_logger
14
+ from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN
15
+
16
+ logger = get_logger()
17
+
18
+
19
def scan_for_report_folders(root_path):
    """Collect report identifiers below *root_path*.

    Each entry encodes ``<run folder><REPORT_TOKEN><model><MODEL_TOKEN><datasets>``
    where dataset names are joined with ``DATASET_TOKEN``.
    """
    logger.debug(f'Scanning for report folders in {root_path}')
    if not os.path.exists(root_path):
        return []

    found = []
    for run_dir in glob.glob(os.path.join(root_path, '*')):
        reports_dir = os.path.join(run_dir, OutputsStructure.REPORTS_DIR)
        if not os.path.exists(reports_dir):
            continue  # this run produced no reports

        # One sub-directory per evaluated model, each holding <dataset>.json files.
        model_dirs = (m for m in glob.glob(os.path.join(reports_dir, '*')) if os.path.isdir(m))
        for model_dir in model_dirs:
            dataset_names = [
                os.path.splitext(os.path.basename(f))[0]
                for f in glob.glob(os.path.join(model_dir, '*.json'))
            ]
            joined = DATASET_TOKEN.join(dataset_names)
            found.append(f'{os.path.basename(run_dir)}{REPORT_TOKEN}{os.path.basename(model_dir)}{MODEL_TOKEN}{joined}')

    found.sort(reverse=True)
    logger.debug(f'reports: {found}')
    return found
48
+
49
+
50
def process_report_name(report_name: str):
    """Split a composite report name into (run prefix, model name, dataset names).

    Inverse of the naming scheme used by ``scan_for_report_folders``:
    ``<prefix><REPORT_TOKEN><model><MODEL_TOKEN><ds1><DATASET_TOKEN><ds2>...``.

    Raises:
        ValueError: if *report_name* lacks the expected tokens.
    """
    # maxsplit=1 keeps the parse stable even if a later component happens to
    # contain a token sequence itself (the bare split would raise ValueError).
    prefix, remainder = report_name.split(REPORT_TOKEN, 1)
    model_name, dataset_str = remainder.split(MODEL_TOKEN, 1)
    datasets = dataset_str.split(DATASET_TOKEN)
    return prefix, model_name, datasets
55
+
56
+
57
def load_single_report(root_path: str, report_name: str):
    """Load one run's report objects plus its task configuration.

    Returns a tuple ``(report_list, dataset_names, task_cfg)``.

    Raises:
        FileNotFoundError: when the run has no YAML config file.
    """
    prefix, model_name, dataset_names = process_report_name(report_name)

    reports_dir = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
    loaded_reports = get_report_list([reports_dir])

    configs_dir = os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)
    yaml_candidates = glob.glob(os.path.join(configs_dir, '*.yaml'))
    if not yaml_candidates:
        raise FileNotFoundError(f'No configuration files found in {configs_dir}')
    # Use the first config found; a run directory is expected to hold one.
    task_cfg = yaml_to_dict(yaml_candidates[0])
    return loaded_reports, dataset_names, task_cfg
70
+
71
+
72
def load_multi_report(root_path: str, report_names: List[str]):
    """Load and concatenate the report objects of several runs."""
    all_reports = []
    for name in report_names:
        prefix, model_name, _ = process_report_name(name)
        model_reports_dir = os.path.join(root_path, prefix, OutputsStructure.REPORTS_DIR, model_name)
        all_reports.extend(get_report_list([model_reports_dir]))
    return all_reports
80
+
81
+
82
def get_acc_report_df(report_list: List[Report]):
    """Flatten reports into an accuracy table (one row per model/dataset).

    Collection reports contribute one row per metric category; regular
    reports contribute a single row with their overall score.
    Returns ``(df, styler)``.
    """
    rows = []
    for report in report_list:
        if report.name == DataCollection.NAME:
            # Collection report: expand every category of every metric.
            for metric in report.metrics:
                rows.extend({
                    ReportKey.model_name: report.model_name,
                    ReportKey.dataset_name: '/'.join(category.name),
                    ReportKey.score: category.score,
                    ReportKey.num: category.num,
                } for category in metric.categories)
        else:
            rows.append({
                ReportKey.model_name: report.model_name,
                ReportKey.dataset_name: report.dataset_name,
                ReportKey.score: report.score,
                # NOTE(review): assumes at least one metric exists — confirm upstream.
                ReportKey.num: report.metrics[0].num,
            })
    df = pd.DataFrame.from_dict(rows, orient='columns')

    styler = style_df(df, columns=[ReportKey.score])
    return df, styler
107
+
108
+
109
def style_df(df: pd.DataFrame, columns: List[str] = None):
    """Return a Styler with a red-to-green gradient over [0, 1] on *columns*
    (all columns if None) and numbers rendered with four decimals."""
    gradient_kwargs = dict(subset=columns, cmap='RdYlGn', vmin=0.0, vmax=1.0, axis=0)
    styler = df.style.background_gradient(**gradient_kwargs)
    styler.format(precision=4)
    return styler
115
+
116
+
117
def get_compare_report_df(acc_df: pd.DataFrame):
    """Pivot the accuracy table to one row per model, one column per dataset.

    Cell values are the (mean-aggregated) scores. Returns ``(df, styler)``.
    """
    pivoted = acc_df.pivot_table(
        index=ReportKey.model_name,
        columns=ReportKey.dataset_name,
        values=ReportKey.score,
    )
    pivoted.reset_index(inplace=True)

    return pivoted, style_df(pivoted)
123
+
124
+
125
def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
    """Filter the accuracy table down to one dataset. Returns ``(df, styler)``."""
    subset = df.loc[df[ReportKey.dataset_name] == dataset_name]
    return subset, style_df(subset, columns=[ReportKey.score])
129
+
130
+
131
def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
    """Return the analysis text of the first report whose dataset matches
    *dataset_name*, or ``'N/A'`` when none matches."""
    matching = (r.analysis for r in report_list if r.dataset_name == dataset_name)
    return next(matching, 'N/A')
136
+
137
+
138
def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
    """Load the cached review results for one model/dataset/subset as a DataFrame.

    Args:
        work_dir: Root output directory of the evaluation run.
        model_name: Name of the evaluated model.
        dataset_name: Dataset to load; ``DataCollection.NAME`` selects a mixed collection.
        subset_name: Subset to load; for collections this is ``'<dataset>/<subset>'``.

    Returns:
        A DataFrame with one row per reviewed sample and columns
        Index/Input/Metadata/Generated/Gold/Pred/Score/NScore.
    """
    # Load review cache
    outputs = OutputsStructure(work_dir, is_make=False)  # is_make=False: only locate, do not create dirs
    cache_manager = CacheManager(outputs, model_name, dataset_name)
    if dataset_name == DataCollection.NAME:
        # Collections store all reviews in one 'default' cache file;
        # subset filtering happens per sample in the loop below.
        review_cache_path = cache_manager.get_review_cache_path('default')
    else:
        review_cache_path = cache_manager.get_review_cache_path(subset_name)
    logger.debug(f'review_path: {review_cache_path}')
    review_caches = jsonl_to_list(review_cache_path)

    ds = []
    for cache in review_caches:
        # Each JSONL record is a serialized ReviewResult (pydantic model).
        review_result = ReviewResult.model_validate(cache)
        sample_score = review_result.sample_score

        if dataset_name == DataCollection.NAME:
            # Filter subset name
            collection_info = sample_score.sample_metadata[DataCollection.INFO]
            sample_dataset_name = collection_info.get('dataset_name', 'default')
            sample_subset_name = collection_info.get('subset_name', 'default')
            if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
                continue

        score = sample_score.score
        metadata = sample_score.sample_metadata
        prediction = score.prediction
        target = review_result.target
        extracted_prediction = score.extracted_prediction
        raw_d = {
            'Index': str(review_result.index),
            'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
            'Metadata': metadata,
            'Generated': prediction or '',  # Ensure no None value
            # NOTE(review): unlike Generated/Pred, Gold is not defaulted and may be None — confirm downstream handles it
            'Gold': target,
            'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
            or '',  # Ensure no None value
            'Score': score.model_dump(exclude_none=True),
            'NScore': normalize_score(score.main_value)
        }
        ds.append(raw_d)

    df_subset = pd.DataFrame(ds)
    return df_subset
182
+
183
+
184
def normalize_score(score):
    """Coerce a raw score into a float, best effort.

    Booleans map to 1.0/0.0, dicts contribute the value of their first key
    (0.0 when empty), anything else goes through ``float()``; unconvertible
    input falls back to 0.0.
    """
    try:
        if isinstance(score, bool):
            return float(score)
        if isinstance(score, dict):
            first_value = next(iter(score.values()), None)
            return 0.0 if first_value is None else float(first_value)
        return float(score)
    except (ValueError, TypeError):
        return 0.0
@@ -0,0 +1,12 @@
1
+ # flake8: noqa
2
+ import os
3
+
4
+
5
def setup_env(args):
    """Apply environment adjustments needed before the dashboard starts."""
    compat_dsw_gradio(args)
7
+
8
+
9
def compat_dsw_gradio(args) -> None:
    """Make Gradio reachable inside a DSW notebook instance.

    DSW proxies the server under ``/<JUPYTER_NAME>/proxy/<port>``; set
    GRADIO_ROOT_PATH accordingly unless the user already configured one.
    """
    jupyter_name = os.environ.get('JUPYTER_NAME')
    if jupyter_name is None or 'dsw-' not in jupyter_name:
        return  # not a DSW environment
    if 'GRADIO_ROOT_PATH' in os.environ:
        return  # respect an explicit user setting
    os.environ['GRADIO_ROOT_PATH'] = f'/{jupyter_name}/proxy/{args.server_port}'
@@ -0,0 +1,221 @@
1
+ """
2
+ Localization utilities for the Evalscope dashboard.
3
+ """
4
+ from typing import Any, Dict
5
+
6
+
7
def get_sidebar_locale(lang: str) -> Dict[str, str]:
    """Sidebar labels for *lang* ('zh' or 'en'); raises KeyError otherwise."""
    entries = [
        ('settings', '设置', 'Settings'),
        ('report_root_path', '报告根路径', 'Report Root Path'),
        ('select_reports', '请选择报告', 'Select Reports'),
        ('load_btn', '加载并查看', 'Load & View'),
        ('note', '请选择报告并点击`加载并查看`来查看数据',
         'Please select reports and click `Load & View` to view the data'),
        ('warning', '没有找到报告,请检查路径', 'No reports found, please check the path'),
    ]
    return {key: {'zh': zh, 'en': en}[lang] for key, zh, en in entries}
35
+
36
+
37
def get_visualization_locale(lang: str) -> Dict[str, str]:
    """Visualization tab labels for *lang* ('zh' or 'en'); raises KeyError otherwise."""
    entries = [
        ('visualization', '可视化', 'Visualization'),
        ('single_model', '单模型', 'Single Model'),
        ('multi_model', '多模型', 'Multi Model'),
    ]
    return {key: {'zh': zh, 'en': en}[lang] for key, zh, en in entries}
53
+
54
+
55
def get_single_model_locale(lang: str) -> Dict[str, str]:
    """Single-model view labels for *lang* ('zh' or 'en'); raises KeyError otherwise."""
    entries = [
        ('select_report', '选择报告', 'Select Report'),
        ('task_config', '任务配置', 'Task Config'),
        ('datasets_overview', '数据集概览', 'Datasets Overview'),
        ('dataset_components', '数据集组成', 'Dataset Components'),
        ('dataset_scores', '数据集分数', 'Dataset Scores'),
        ('report_analysis', '报告智能分析', 'Report Intelligent Analysis'),
        ('dataset_scores_table', '数据集分数表', 'Dataset Scores Table'),
        ('dataset_details', '数据集详情', 'Dataset Details'),
        ('select_dataset', '选择数据集', 'Select Dataset'),
        ('model_prediction', '模型预测', 'Model Prediction'),
        ('select_subset', '选择子集', 'Select Subset'),
        ('answer_mode', '答案模式', 'Answer Mode'),
        ('page', '页码', 'Page'),
        ('score_threshold', '分数阈值', 'Score Threshold'),
    ]
    return {key: {'zh': zh, 'en': en}[lang] for key, zh, en in entries}
115
+
116
+
117
def get_multi_model_locale(lang: str) -> Dict[str, str]:
    """Multi-model comparison labels for *lang* ('zh' or 'en'); raises KeyError otherwise."""
    entries = [
        ('select_reports', '请选择报告', 'Select Reports'),
        ('models_overview', '模型概览', 'Models Overview'),
        ('model_radar', '模型对比雷达', 'Model Comparison Radar'),
        ('model_scores', '模型对比分数', 'Model Comparison Scores'),
        ('model_comparison_details', '模型对比详情', 'Model Comparison Details'),
        ('select_model_a', '选择模型A', 'Select Model A'),
        ('select_model_b', '选择模型B', 'Select Model B'),
        ('select_dataset', '选择数据集', 'Select Dataset'),
        ('model_predictions', '模型预测', 'Model Predictions'),
        ('select_subset', '选择子集', 'Select Subset'),
        ('answer_mode', '答案模式', 'Answer Mode'),
        ('score_threshold', '分数阈值', 'Score Threshold'),
        ('comparison_counts', '对比统计', 'Comparison Counts'),
        ('page', '页码', 'Page'),
        ('input', '输入', 'Input'),
        ('gold_answer', '标准答案', 'Gold Answer'),
        ('score', '分数', 'Score'),
        ('normalized_score', '归一化分数', 'Normalized Score'),
        ('prediction', '预测', 'Prediction'),
        ('generated', '生成结果', 'Generated'),
    ]
    return {key: {'zh': zh, 'en': en}[lang] for key, zh, en in entries}
201
+
202
+
203
def get_app_locale(lang: str) -> Dict[str, str]:
    """Top-level app labels for *lang* ('zh' or 'en'); raises KeyError otherwise."""
    entries = [
        ('title', '📈 EvalScope 看板', '📈 Evalscope Dashboard'),
        (
            'star_beggar',
            '喜欢<a href="https://github.com/modelscope/evalscope" target="_blank">EvalScope</a>就动动手指给我们加个star吧 🥺 ',
            'If you like <a href="https://github.com/modelscope/evalscope" target="_blank">EvalScope</a>, '
            'please take a few seconds to star us 🥺 ',
        ),
        ('note', '请选择报告', 'Please select reports'),
    ]
    return {key: {'zh': zh, 'en': en}[lang] for key, zh, en in entries}
@@ -0,0 +1,119 @@
1
+ """
2
+ Text processing utilities for the Evalscope dashboard.
3
+ """
4
+ import json
5
+ import os
6
+ import re
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from evalscope.utils.logger import get_logger
10
+ from ..constants import LATEX_DELIMITERS
11
+
12
+ logger = get_logger()
13
+
14
+
15
def convert_markdown_image(text: str):
    """Turn base64 image data or a local image path into a markdown image tag.

    Anything that is neither is returned unchanged.
    """
    if text.startswith('data:image'):
        # Inline base64 payloads can be embedded directly.
        markdown = f'![image]({text})'
        logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {markdown[:40]}...')
        return markdown
    if os.path.isfile(text) and text.endswith(('.png', '.jpg', '.jpeg')):
        # Gradio serves local files below the gradio_api/file= route.
        text = os.path.abspath(text)
        markdown = f'![image](gradio_api/file={text})'
        logger.debug(f'Converting image path to markdown: {text} -> {markdown}')
        return markdown
    return text
29
+
30
+
31
def convert_html_tags(text):
    """Rewrite <tag> / </tag> as [tag] / [/tag] so markdown shows them literally."""
    opened = re.sub(r'<(\w+)>', r'[\1]', text)
    return re.sub(r'</(\w+)>', r'[/\1]', opened)
37
+
38
+
39
def process_string(string: str, max_length: int = 2048) -> str:
    """Escape HTML-like tags and, when longer than *max_length*, keep only
    the head and tail of the string joined by an ellipsis."""
    string = convert_html_tags(string)  # for display labels e.g. <answer>
    if not max_length or len(string) <= max_length:
        return string
    return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
44
+
45
+
46
def dict_to_markdown(data) -> str:
    """Render a dict as markdown: bold keys with values expanded recursively
    (lists as bullet points, nested dicts inline)."""
    rendered = []
    for key, value in data.items():
        if isinstance(value, list):
            bullets = [f'- {process_model_prediction(entry, max_length=None)}' for entry in value]
            text = '\n' + '\n'.join(bullets)
        elif isinstance(value, dict):
            text = dict_to_markdown(value)
        else:
            text = str(value)

        text = process_string(text, max_length=None)  # escape tags, never truncate
        rendered.append(f'**{key}**:\n{text}')

    return '\n\n'.join(rendered)
64
+
65
+
66
def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
    """Legacy prediction formatter (superseded by ``process_model_prediction``).

    Dicts become markdown key/value sections, lists become bullet lists,
    everything else is stringified; the final text is tag-escaped and
    truncated to *max_length* unless that is None.
    """
    if isinstance(item, dict):
        formatted = dict_to_markdown(item)
    elif isinstance(item, list):
        formatted = '\n'.join(f'- {process_model_prediction(sub, max_length=None)}' for sub in item)
    else:
        formatted = str(item)

    # Escape/truncate once, at the outermost call only.
    return process_string(formatted, max_length) if max_length is not None else formatted
88
+
89
+
90
def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
    """Format a prediction for display: dicts/lists become a fenced JSON
    block, other values are stringified; optionally tag-escape and truncate."""
    if isinstance(item, (dict, list)):
        payload = json.dumps(item, ensure_ascii=False, indent=2)
        text = f'```json\n{payload}\n```'
    else:
        text = str(item)

    if max_length is None:
        return text
    # Apply HTML tag conversion and truncation only at the final output.
    return process_string(text, max_length)
102
+
103
+
104
def process_json_content(content: Any) -> str:
    """Serialize *content* as pretty-printed JSON for markdown display.

    Bare strings are wrapped under a ``'content'`` key first so the output
    is always a JSON document.
    """
    payload = {'content': content} if isinstance(content, str) else content
    return json.dumps(payload, ensure_ascii=False, indent=2)
@@ -0,0 +1,96 @@
1
+ """
2
+ Visualization utilities for the Evalscope dashboard.
3
+ """
4
+ import numpy as np
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from typing import List
9
+
10
+ from evalscope.constants import DataCollection
11
+ from evalscope.report import Report, ReportKey, get_data_frame
12
+ from evalscope.utils.logger import get_logger
13
+ from ..constants import DEFAULT_BAR_WIDTH, PLOTLY_THEME
14
+
15
+ logger = get_logger()
16
+
17
+
18
def plot_single_report_scores(df: pd.DataFrame):
    """Bar chart of per-dataset scores for one model; a None df yields None."""
    if df is None:
        return None
    logger.debug(f'df: \n{df}')
    fig = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

    # Keep bars slim when only a handful of datasets are shown.
    bar_width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
    fig.update_traces(width=bar_width, texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
    return fig
28
+
29
+
30
def plot_single_report_sunburst(report_list: List[Report]):
    """Sunburst of dataset -> category -> subset, sized by sample count and
    colored by score (red-to-green, weighted-average midpoint)."""
    is_collection = report_list[0].name == DataCollection.NAME
    if is_collection:
        df = get_data_frame(report_list=report_list)
    else:
        df = get_data_frame(report_list=report_list, flatten_metrics=False)
    category_cols = sorted(c for c in df.columns if c.startswith(ReportKey.category_prefix))
    if is_collection:
        hierarchy = category_cols + [ReportKey.subset_name]
    else:
        hierarchy = [ReportKey.dataset_name] + category_cols + [ReportKey.subset_name]
    logger.debug(f'df: \n{df}')
    df[category_cols] = df[category_cols].fillna('default')  # NOTE: fillna for empty categories

    fig = px.sunburst(
        df,
        path=hierarchy,
        values=ReportKey.num,
        color=ReportKey.score,
        color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
        color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
        template=PLOTLY_THEME,
        maxdepth=4,
    )
    fig.update_traces(insidetextorientation='radial')
    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
    return fig
55
+
56
+
57
def plot_single_dataset_scores(df: pd.DataFrame):
    """Grouped bar chart of subset scores per metric for one dataset."""
    # TODO: add metric radio and replace category name
    fig = px.bar(
        df,
        x=df[ReportKey.metric_name],
        y=df[ReportKey.score],
        color=df[ReportKey.subset_name],
        text=df[ReportKey.score],
        barmode='group',
    )

    bar_width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
    fig.update_traces(width=bar_width, texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide', yaxis=dict(range=[0, 1]), template=PLOTLY_THEME)
    return fig
72
+
73
+
74
def plot_multi_report_radar(df: pd.DataFrame):
    """Radar chart comparing models on the datasets they all share."""
    fig = go.Figure()

    by_model = df.groupby(ReportKey.model_name)
    # Only datasets present for every model are comparable on one radar.
    shared_datasets = set.intersection(*[set(g[ReportKey.dataset_name]) for _, g in by_model])

    for model_name, group in by_model:
        subset = group[group[ReportKey.dataset_name].isin(shared_datasets)]
        trace = go.Scatterpolar(
            r=subset[ReportKey.score],
            theta=subset[ReportKey.dataset_name],
            name=model_name,
            fill='toself',
        )
        fig.add_trace(trace)

    fig.update_layout(
        template=PLOTLY_THEME,
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        margin=dict(t=20, l=20, r=20, b=20)
    )
    return fig