evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/app/ui/multi_model.py CHANGED
@@ -10,8 +10,14 @@ from typing import TYPE_CHECKING
  from evalscope.report import ReportKey, get_data_frame
  from evalscope.utils.logger import get_logger
  from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
- from ..utils.data_utils import (get_acc_report_df, get_compare_report_df, get_model_prediction, get_single_dataset_df,
- load_multi_report, load_single_report)
+ from ..utils.data_utils import (
+ get_acc_report_df,
+ get_compare_report_df,
+ get_model_prediction,
+ get_single_dataset_df,
+ load_multi_report,
+ load_single_report,
+ )
  from ..utils.localization import get_multi_model_locale
  from ..utils.text_utils import convert_markdown_image, process_model_prediction
  from ..utils.visualization import plot_multi_report_radar
@@ -62,7 +68,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  label=locale_dict.get('answer_mode'),
  choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
  value='All',
- interactive=True)
+ interactive=True
+ )
  score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

  data_comparison_df = gr.State(None)
@@ -75,7 +82,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  comparison_counts = gr.Markdown('')
  with gr.Column():
  page_number = gr.Number(
- value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+ value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+ )

  # Input and Gold answer sections remain at the top
  with gr.Row(variant='panel'):
@@ -133,7 +141,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):

  @multi_report_name.change(
  inputs=[sidebar.root_path, multi_report_name],
- outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select])
+ outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+ )
  def update_multi_report_data(root_path, multi_report_names):
  if not multi_report_names:
  return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
@@ -147,13 +156,14 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  model_choices = multi_report_names

  return report_list, report_radar_plot, styler, gr.update(
- choices=model_choices, value=model_choices[0]), gr.update(
- choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)
+ choices=model_choices, value=model_choices[0]
+ ), gr.update(choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)

  @gr.on(
  triggers=[model_a_select.change, model_b_select.change],
  inputs=[sidebar.root_path, model_a_select, model_b_select],
- outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio])
+ outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+ )
  def update_selected_models(root_path, model_a, model_b):
  if not model_a or not model_b:
  return gr.skip()
@@ -172,13 +182,16 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
  model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

- return (model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
- gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None))
+ return (
+ model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
+ gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None)
+ )

  @gr.on(
  triggers=[dataset_radio.change],
  inputs=[dataset_radio, model_a_report, model_b_report],
- outputs=[subset_select, data_comparison_df])
+ outputs=[subset_select, data_comparison_df]
+ )
  def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
  if not dataset_name or model_a_report is None or model_b_report is None:
  return gr.skip()
@@ -198,7 +211,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[subset_select.change],
  inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
- outputs=[data_comparison_df, page_number])
+ outputs=[data_comparison_df, page_number]
+ )
  def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
  if not subset_name or not dataset_name:
  return gr.skip()
@@ -230,7 +244,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
  inputs=[data_comparison_df, answer_mode_radio, score_threshold],
- outputs=[filtered_comparison_df, page_number, comparison_counts])
+ outputs=[filtered_comparison_df, page_number, comparison_counts]
+ )
  def filter_comparison_data(comparison_df, answer_mode, score_threshold):
  if comparison_df is None:
  return None, gr.update(value=1, maximum=1), ''
@@ -256,13 +271,19 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  # Count statistics
  pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
  pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
- pass_both_count = len(comparison_df[(comparison_df['A_NScore'] >= score_threshold)
- & (comparison_df['B_NScore'] >= score_threshold)])
- fail_both_count = len(comparison_df[(comparison_df['A_NScore'] < score_threshold)
- & (comparison_df['B_NScore'] < score_threshold)])
-
- counts_text = (f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
- f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}')
+ pass_both_count = len(
+ comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+ & (comparison_df['B_NScore'] >= score_threshold)]
+ )
+ fail_both_count = len(
+ comparison_df[(comparison_df['A_NScore'] < score_threshold)
+ & (comparison_df['B_NScore'] < score_threshold)]
+ )
+
+ counts_text = (
+ f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
+ f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}'
+ )

  max_page = max(1, len(filtered_df))

@@ -277,9 +298,11 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  outputs=[
  input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
  model_b_pred, model_b_score, model_b_nscore
- ])
- def update_comparison_display(filtered_df, page_number, score_threshold, model_a_select, model_b_select,
- model_a_name_val, model_b_name_val):
+ ]
+ )
+ def update_comparison_display(
+ filtered_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name_val, model_b_name_val
+ ):
  if filtered_df is None or len(filtered_df) == 0:
  return '', '', '', '', '', '', '', '', '', ''

@@ -317,7 +340,9 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
  else:
  b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"

- return (input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
- b_score_md, b_nscore_html)
+ return (
+ input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
+ b_score_md, b_nscore_html
+ )

  return MultiModelComponents(multi_report_name=multi_report_name)
evalscope/app/ui/single_model.py CHANGED
@@ -10,8 +10,13 @@ from typing import TYPE_CHECKING
  from evalscope.report import Report, ReportKey, get_data_frame
  from evalscope.utils.logger import get_logger
  from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
- from ..utils.data_utils import (get_acc_report_df, get_model_prediction, get_report_analysis, get_single_dataset_df,
- load_single_report)
+ from ..utils.data_utils import (
+ get_acc_report_df,
+ get_model_prediction,
+ get_report_analysis,
+ get_single_dataset_df,
+ load_single_report,
+ )
  from ..utils.localization import get_single_model_locale
  from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
  from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst
@@ -63,7 +68,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):

  with gr.Row():
  answer_mode_radio = gr.Radio(
- label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
+ label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+ )
  score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

  data_review_df = gr.State(None)
@@ -76,7 +82,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  answer_mode_counts = gr.Markdown('')
  with gr.Column():
  page_number = gr.Number(
- value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True)
+ value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+ )

  # show data review table
  with gr.Row(variant='panel'):
@@ -98,14 +105,15 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  with gr.Row(variant='panel'):
  with gr.Column():
  gr.Markdown('### *Input*')
- input_text = gr.Code('', elem_id='input_text', language='json', wrap_lines=False)
+ input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)
  with gr.Column():
  gr.Markdown('### *Generated*')
  generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)

  @report_name.change(
  inputs=[sidebar.root_path, report_name],
- outputs=[report_list, task_config, dataset_radio, work_dir, model_name])
+ outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+ )
  def update_single_report_data(root_path, report_name):
  report_list, datasets, task_cfg = load_single_report(root_path, report_name)
  work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
@@ -122,7 +130,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[dataset_radio.change, report_list.change],
  inputs=[dataset_radio, report_list],
- outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
+ outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+ )
  def update_single_report_dataset(dataset_name, report_list):
  logger.debug(f'Updating single report dataset: {dataset_name}')
  report_df = get_data_frame(report_list=report_list)
@@ -136,7 +145,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[subset_select.change],
  inputs=[work_dir, model_name, dataset_radio, subset_select],
- outputs=[data_review_df, page_number])
+ outputs=[data_review_df, page_number]
+ )
  def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
  if not subset_name:
  return gr.skip()
@@ -146,7 +156,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
  inputs=[data_review_df, answer_mode_radio, score_threshold],
- outputs=[filtered_review_df, page_number, answer_mode_counts])
+ outputs=[filtered_review_df, page_number, answer_mode_counts]
+ )
  def filter_data(data_review_df, answer_mode, score_threshold):
  if data_review_df is None:
  return None, gr.update(value=1, maximum=1), ''
@@ -172,7 +183,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  @gr.on(
  triggers=[filtered_review_df.change, page_number.change],
  inputs=[filtered_review_df, page_number, score_threshold],
- outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+ outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+ )
  def update_table_components(filtered_df, page_number, score_threshold):
  if filtered_df is None or len(filtered_df) == 0:
  return '', '', '', '', '', ''
@@ -185,10 +197,10 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
  row = filtered_df.iloc[start]

  # Process the data for display
- input_md = process_json_content(row['Input'])
- generated_md = process_model_prediction(row['Generated'])
- gold_md = process_model_prediction(row['Gold'])
- pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+ input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
+ generated_md = convert_markdown_image(row['Generated'])
+ gold_md = convert_markdown_image(row['Gold'])
+ pred_md = process_model_prediction(row['Pred'])
  score_md = process_json_content(row['Score'])
  nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
evalscope/app/utils/data_utils.py CHANGED
@@ -2,14 +2,14 @@
  Data loading and processing utilities for the Evalscope dashboard.
  """
  import glob
- import numpy as np
  import os
  import pandas as pd
  from typing import Any, Dict, List, Union

+ from evalscope.api.evaluator import CacheManager, ReviewResult
  from evalscope.constants import DataCollection
  from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
- from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+ from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list, yaml_to_dict
  from evalscope.utils.logger import get_logger
  from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN

@@ -39,7 +39,8 @@ def scan_for_report_folders(root_path):
  datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
  datasets = DATASET_TOKEN.join(datasets)
  reports.append(
- f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
+ f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+ )

  reports = sorted(reports, reverse=True)
  logger.debug(f'reports: {reports}')
@@ -61,7 +62,8 @@ def load_single_report(root_path: str, report_name: str):
  config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
  if not config_files:
  raise FileNotFoundError(
- f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}')
+ f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+ )
  task_cfg_path = config_files[0]
  task_cfg = yaml_to_dict(task_cfg_path)
  return report_list, datasets, task_cfg
@@ -134,31 +136,45 @@ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:


  def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
- data_path = os.path.join(work_dir, OutputsStructure.REVIEWS_DIR, model_name)
- subset_name = subset_name.replace('/', '_') # for collection report
- review_path = os.path.join(data_path, f'{dataset_name}_{subset_name}.jsonl')
- logger.debug(f'review_path: {review_path}')
- origin_df = pd.read_json(review_path, lines=True)
+ # Load review cache
+ outputs = OutputsStructure(work_dir, is_make=False)
+ cache_manager = CacheManager(outputs, model_name, dataset_name)
+ if dataset_name == DataCollection.NAME:
+ review_cache_path = cache_manager.get_review_cache_path('default')
+ else:
+ review_cache_path = cache_manager.get_review_cache_path(subset_name)
+ logger.debug(f'review_path: {review_cache_path}')
+ review_caches = jsonl_to_list(review_cache_path)

  ds = []
- for i, item in origin_df.iterrows():
- raw_input = item['raw_input']
- sample_index = item['index']
- for choice_index, choice in enumerate(item['choices']):
- raw_pred_answer = choice['message']['content']
- parsed_gold_answer = choice['review']['gold']
- parsed_pred_answer = choice['review']['pred']
- score = choice['review']['result']
- raw_d = {
- 'Index': f'{sample_index}_{choice_index}',
- 'Input': raw_input,
- 'Generated': raw_pred_answer if raw_pred_answer != parsed_pred_answer else '*Same as Pred*',
- 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer,
- 'Score': score,
- 'NScore': normalize_score(score)
- }
- ds.append(raw_d)
+ for cache in review_caches:
+ review_result = ReviewResult.model_validate(cache)
+ sample_score = review_result.sample_score
+
+ if dataset_name == DataCollection.NAME:
+ # Filter subset name
+ collection_info = sample_score.sample_metadata[DataCollection.INFO]
+ sample_dataset_name = collection_info.get('dataset_name', 'default')
+ sample_subset_name = collection_info.get('subset_name', 'default')
+ if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
+ continue
+
+ score = sample_score.score
+ metadata = sample_score.sample_metadata
+ prediction = score.prediction
+ target = review_result.target
+ extracted_prediction = score.extracted_prediction
+ raw_d = {
+ 'Index': str(review_result.index),
+ 'Input': review_result.input.replace('\n', '\n\n'), # for markdown
+ 'Metadata': metadata,
+ 'Generated': prediction,
+ 'Gold': target,
+ 'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+ 'Score': score.model_dump(exclude_none=True),
+ 'NScore': normalize_score(score.main_value)
+ }
+ ds.append(raw_d)

  df_subset = pd.DataFrame(ds)
  return df_subset
evalscope/app/utils/env_utils.py ADDED
@@ -0,0 +1,12 @@
+ # flake8: noqa
+ import os
+
+
+ def setup_env(args):
+ compat_dsw_gradio(args)
+
+
+ def compat_dsw_gradio(args) -> None:
+ if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+ in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+ os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
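Note: the new compat_dsw_gradio helper only rewrites the environment when it detects a DSW notebook session. A minimal sketch of the effect, using a hypothetical args object in place of the real parsed CLI namespace:

    import os
    from types import SimpleNamespace

    # Hypothetical stand-in for the parsed CLI arguments consumed by setup_env().
    args = SimpleNamespace(server_port=7860)

    os.environ['JUPYTER_NAME'] = 'dsw-12345'   # simulate a DSW notebook session
    os.environ.pop('GRADIO_ROOT_PATH', None)

    # Same condition as the diffed helper above.
    if ('JUPYTER_NAME' in os.environ) and ('dsw-' in os.environ['JUPYTER_NAME']) \
            and ('GRADIO_ROOT_PATH' not in os.environ):
        os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"

    print(os.environ['GRADIO_ROOT_PATH'])  # prints /dsw-12345/proxy/7860
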
evalscope/app/utils/text_utils.py CHANGED
@@ -2,11 +2,9 @@
  Text processing utilities for the Evalscope dashboard.
  """
  import json
- import numpy as np
  import os
- import pandas as pd
  import re
- from typing import Any, Dict, List
+ from typing import Any, Dict, List, Optional

  from evalscope.utils.logger import get_logger
  from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
  logger = get_logger()


- def convert_markdown_image(text):
- if not os.path.isfile(text):
- return text
- # Convert the image path to a markdown image tag
- if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
- text = os.path.abspath(text)
- image_tag = f'![image](gradio_api/file={text})'
- logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+ def convert_markdown_image(text: str):
+ if text.startswith('data:image'):
+ # Convert base64 image data to a markdown image tag
+ image_tag = f'![image]({text})'
+ logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
  return image_tag
+ elif os.path.isfile(text):
+ # Convert the image path to a markdown image tag
+ if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+ text = os.path.abspath(text)
+ image_tag = f'![image](gradio_api/file={text})'
+ logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+ return image_tag
  return text


@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
  return result


- def process_model_prediction(item: Any, max_length: int = 32000) -> str:
+ def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
  if isinstance(item, (dict, list)):
  result = json.dumps(item, ensure_ascii=False, indent=2)
  result = f'```json\n{result}\n```'
@@ -109,8 +111,6 @@ def process_json_content(content: Any) -> str:
  Returns:
  str: The processed content formatted for markdown display.
  """
- if isinstance(content, (np.bool_, np.int_, np.float_)):
- content = str(content)

  if isinstance(content, str):
  content = {'content': content}
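Note: as the hunk above shows, convert_markdown_image now handles base64 data URIs in addition to image file paths. A minimal usage sketch based only on the code in this diff (the data URI below is an illustrative, truncated payload):

    # Assumes the updated module from this release is importable.
    from evalscope.app.utils.text_utils import convert_markdown_image

    data_uri = 'data:image/png;base64,iVBORw0KGgo='       # illustrative placeholder payload
    print(convert_markdown_image(data_uri))                # -> '![image](data:image/png;base64,iVBORw0KGgo=)'
    print(convert_markdown_image('plain text, no image'))  # not a data URI or existing file -> returned unchanged
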
evalscope/app/utils/visualization.py CHANGED
@@ -47,7 +47,8 @@ def plot_single_report_sunburst(report_list: List[Report]):
  color_continuous_scale='RdYlGn', # see https://plotly.com/python/builtin-colorscales/
  color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
  template=PLOTLY_THEME,
- maxdepth=4)
+ maxdepth=4
+ )
  plot.update_traces(insidetextorientation='radial')
  plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
  return plot
@@ -61,7 +62,8 @@ def plot_single_dataset_scores(df: pd.DataFrame):
  y=df[ReportKey.score],
  color=df[ReportKey.subset_name],
  text=df[ReportKey.score],
- barmode='group')
+ barmode='group'
+ )

  width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
  plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
@@ -82,10 +84,13 @@ def plot_multi_report_radar(df: pd.DataFrame):
  r=common_group[ReportKey.score],
  theta=common_group[ReportKey.dataset_name],
  name=model_name,
- fill='toself'))
+ fill='toself'
+ )
+ )

  fig.update_layout(
  template=PLOTLY_THEME,
  polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
- margin=dict(t=20, l=20, r=20, b=20))
+ margin=dict(t=20, l=20, r=20, b=20)
+ )
  return fig
evalscope/arguments.py CHANGED
@@ -1,7 +1,8 @@
+ # flake8: noqa: E501
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
+ from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask


  class ParseStrArgsAction(argparse.Action):
@@ -47,7 +48,6 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.') # noqa: E501

  # Template-related arguments
- parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
  parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.') # noqa: E501

  # Dataset-related arguments
@@ -60,30 +60,27 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

  # Evaluation-related arguments
- parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
- choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+ parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
  parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
  choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
  parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
- parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
- choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
- parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
  parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+ parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+ parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.') # noqa: E501

  # Cache and working directory arguments
- parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
  parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+ parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
  parser.add_argument('--work-dir', type=str, help='The root cache dir.')

  # Debug and runtime mode arguments
  parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
  parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
- parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
  parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
  parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
  parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
  parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
- parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
+ parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.') # noqa: E501

  # LLMJudge arguments
  parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
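Note: in this hunk the evaluation options gain --repeats and --rerun-review, while --template-type, --stage, --mem-cache, and --dry-run are dropped. A minimal argparse sketch (a stand-in parser, not the full evalscope CLI; the path value is a placeholder) showing how the new flags parse:

    import argparse

    # Stand-in parser mirroring only the arguments added in this hunk.
    parser = argparse.ArgumentParser()
    parser.add_argument('--repeats', type=int, default=1)
    parser.add_argument('--use-cache', type=str)
    parser.add_argument('--rerun-review', action='store_true', default=False)

    args = parser.parse_args(['--repeats', '4', '--use-cache', 'outputs/some_run', '--rerun-review'])
    print(args.repeats, args.use_cache, args.rerun_review)  # 4 outputs/some_run True
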
evalscope/backend/opencompass/api_meta_template.py CHANGED
@@ -49,7 +49,8 @@ register_template(
  reserved_roles=[
  dict(role='SYSTEM', api_role='SYSTEM'),
  ],
- ))
+ )
+ )

  if __name__ == '__main__':
  res = MetaTemplateType.get_template_name_list()
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
  datasets: list, the datasets.
  models: list, the models.
  work_dir (Optional): str, the working directory. Default to None, which means the current directory.
- dry_run (Optional): bool, the dry-run flag. Default to False.
  debug (Optional): bool, the debug flag. Default to False.
  reuse (Optional): str, reuse previous outputs & results. Default to None.
  generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
  cmd_str = f'python -m run_oc ' \
  f'--models {" ".join(self.args.models)} ' \
  f'--datasets {" ".join(self.args.datasets)} ' \
- f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
  f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'

  elif cmd_mode == CmdMode.SCRIPT:
@@ -182,8 +180,10 @@ class OpenCompassBackendManager(BackendManager):
  else:
  valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
  if len(invalid_dataset_names) > 0:
- logger.error(f'Invalid datasets: {invalid_dataset_names}, '
- f'refer to the following list to get proper dataset name: {dataset_names_all}')
+ logger.error(
+ f'Invalid datasets: {invalid_dataset_names}, '
+ f'refer to the following list to get proper dataset name: {dataset_names_all}'
+ )
  assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
  f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -252,7 +252,8 @@ if __name__ == '__main__':
  'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
  }],
  'limit': 5
- })
+ }
+ )
  all_datasets = OpenCompassBackendManager.list_datasets()
  print(f'all_datasets: {all_datasets}')
  oc_backend_manager.run()
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
@@ -100,16 +100,16 @@ class DatasetWrapper(TorchDataset):

  def get_dataset_default_task(dataset):
  if dataset in (
- 'custom',
- 'muge',
- 'flickr30k',
- 'flickr8k',
- 'mscoco_captions',
- 'mscoco_captions2017',
- 'multilingual_mscoco_captions',
- 'flickr30k-200',
- 'crossmodal3600',
- 'xtd200',
+ 'custom',
+ 'muge',
+ 'flickr30k',
+ 'flickr8k',
+ 'mscoco_captions',
+ 'mscoco_captions2017',
+ 'multilingual_mscoco_captions',
+ 'flickr30k-200',
+ 'crossmodal3600',
+ 'xtd200',
  ):
  return 'zeroshot_retrieval'
  else:
evalscope/backend/rag_eval/clip_benchmark/task_template.py CHANGED
@@ -4,8 +4,11 @@ import torch
  from itertools import product

  from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
- from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (build_dataset, get_dataloader,
- get_dataset_default_task)
+ from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+ build_dataset,
+ get_dataloader,
+ get_dataset_default_task,
+ )
  from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
  from evalscope.backend.rag_eval.utils.clip import VisionModel
  from evalscope.utils.logger import get_logger
@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
  if verbose:
  logger.info(f'Zero-shot templates: {zeroshot_templates}')
  classnames = dataset.classes if hasattr(dataset, 'classes') else None
- assert (zeroshot_templates is not None
- and classnames is not None), 'Dataset does not support classification'
+ assert (
+ zeroshot_templates is not None and classnames is not None
+ ), 'Dataset does not support classification'
  metrics = zeroshot_classification.evaluate(
  model,
  dataloader,
evalscope/backend/rag_eval/ragas/task_template.py CHANGED
@@ -34,7 +34,8 @@ def rag_eval(args: EvaluationArguments, ) -> None:
  target_lang=args.language,
  llm=LangchainLLMWrapper(llm),
  adapt_instruction=True,
- ))
+ )
+ )
  # load dataset
  dataset = Dataset.from_json(args.testset_file)