evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of evalscope has been flagged as potentially problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -42,6 +42,8 @@ async def get_requests(args: Arguments, api_plugin: 'ApiPluginBase') -> AsyncGen
  try:
  for messages in message_generator.build_messages():
  dataset_messages.append(messages)
+ if len(dataset_messages) >= args.number:
+ break
  except StopIteration:
  pass

@@ -189,7 +191,8 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
  await connect_test(args, api_plugin)
  # start statistic benchmark metric
  statistic_benchmark_metric_task = asyncio.create_task(
- statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
+ statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+ )
  # start send request
  semaphore = asyncio.Semaphore(args.parallel)
  send_request_tasks: List[asyncio.Task] = []
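The first hunk above caps request collection at `args.number` instead of exhausting the dataset generator. A minimal standalone sketch of that pattern; the `number` cap mirrors the diff, while the surrounding harness is simplified and hypothetical:

```python
from typing import Iterable, Iterator, List


def collect_messages(message_iter: Iterable[List[dict]], number: int) -> List[List[dict]]:
    """Collect at most `number` message lists from a possibly unbounded generator."""
    dataset_messages: List[List[dict]] = []
    try:
        for messages in message_iter:
            dataset_messages.append(messages)
            if len(dataset_messages) >= number:
                break  # stop early instead of materializing the whole dataset
    except StopIteration:
        # mirrors the original handler for iterables that raise explicitly
        pass
    return dataset_messages


# Hypothetical usage: cap an endless prompt stream at 5 requests.
endless: Iterator[List[dict]] = ([{'role': 'user', 'content': 'hi'}] for _ in iter(int, 1))
assert len(collect_messages(endless, number=5)) == 5
```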
@@ -26,7 +26,8 @@ class AioHttpClient:
  self.api_plugin = api_plugin
  self.client = aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
- trace_configs=[self._create_trace_config()] if args.debug else [])
+ trace_configs=[self._create_trace_config()] if args.debug else []
+ )

  async def __aenter__(self):
  pass
@@ -105,7 +106,8 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
  while True:
  try:
  is_error, state_code, response_data = await asyncio.wait_for(
- attempt_connection(), timeout=args.connect_timeout)
+ attempt_connection(), timeout=args.connect_timeout
+ )
  if not is_error:
  logger.info('Test connection successful.')
  return True
@@ -153,7 +153,8 @@ class CustomPlugin(ApiPluginBase):

  # If no usage information and no tokenizer, raise an error
  raise ValueError(
- 'Cannot determine token counts: no usage information in response and no tokenizer provided.')
+ 'Cannot determine token counts: no usage information in response and no tokenizer provided.'
+ )

  except Exception as e:
  logger.error(f'Error parsing responses: {e}')
@@ -186,8 +187,7 @@ class CustomPlugin(ApiPluginBase):
  data = json.dumps(body, ensure_ascii=False)

  # Send the request
- async with client_session.request(
- 'POST', url=url, data=data, headers=headers) as response: # noqa: E125
+ async with client_session.request('POST', url=url, data=data, headers=headers) as response: # noqa: E125
  # Get the status code
  status_code = response.status

@@ -244,6 +244,7 @@ if __name__ == '__main__':
  api='custom', # Use the custom API plugin registered above
  dataset='openqa',
  number=1,
- max_tokens=10)
+ max_tokens=10
+ )

  run_perf_benchmark(args)
@@ -159,13 +159,15 @@ class OpenaiPlugin(DefaultApiPlugin):
  input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
  output_tokens += len(self.tokenizer.encode(full_response_content))
  else:
- raise ValueError('Error: Unable to retrieve usage information\n\n'
- 'This error occurs when:\n'
- '1. The API response does not contain usage data, AND\n'
- '2. No tokenizer has been specified or found.\n\n'
- 'To resolve this issue, do ONE of the following:\n'
- "a) Ensure that the API you're using supports and returns usage information, OR\n"
- 'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
- 'If you continue to experience issues, '
- 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .')
+ raise ValueError(
+ 'Error: Unable to retrieve usage information\n\n'
+ 'This error occurs when:\n'
+ '1. The API response does not contain usage data, AND\n'
+ '2. No tokenizer has been specified or found.\n\n'
+ 'To resolve this issue, do ONE of the following:\n'
+ "a) Ensure that the API you're using supports and returns usage information, OR\n"
+ 'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+ 'If you continue to experience issues, '
+ 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
+ )
  return input_tokens, output_tokens
@@ -17,7 +17,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]
@@ -24,5 +24,5 @@ class FlickrDatasetPlugin(DatasetPluginBase):
  text = item['txt']
  base64_image = PIL_to_base64(pil_image)

- message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
  yield [message]
@@ -24,5 +24,5 @@ class KontextDatasetPlugin(DatasetPluginBase):
  text = item['instruction']
  base64_image = PIL_to_base64(pil_image)

- message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
  yield [message]
@@ -18,7 +18,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]
@@ -23,7 +23,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
  for item in ds:
  prompt = item['instruction'].strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]
@@ -27,8 +27,10 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  item = json.loads(item)
  prompt = item['question'].strip()
- if (len(prompt) > self.query_parameters.min_prompt_length
- and len(prompt) < self.query_parameters.max_prompt_length):
+ if (
+ len(prompt) > self.query_parameters.min_prompt_length
+ and len(prompt) < self.query_parameters.max_prompt_length
+ ):
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]
@@ -1,8 +1,8 @@
  import time
- import torch
  from dataclasses import dataclass, field
  from typing import Any, List, Optional, Tuple

+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -33,8 +33,8 @@ class BenchmarkData:
  if len(self.chunk_times) > 1:
  self.first_chunk_latency = self.chunk_times[0] - self.start_time
  # remove the first chunk time from the total latency
- self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
- self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+ self.time_per_output_token = (self.query_latency - self.first_chunk_latency
+ ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
  self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
  else:
  self.first_chunk_latency = self.query_latency
@@ -44,10 +44,13 @@ class BenchmarkData:
  api_plugin.parse_responses(self.response_messages, request=self.request)

  def update_gpu_usage(self):
- total_memory = 0
- for i in range(torch.cuda.device_count()):
- total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
- self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)
+ if check_import('torch'):
+
+ import torch
+ total_memory = 0
+ for i in range(torch.cuda.device_count()):
+ total_memory += (torch.cuda.max_memory_allocated(i) / 2**30) # GB
+ self.max_gpu_memory_cost = max(self.max_gpu_memory_cost, total_memory)


  class Metrics:
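The `update_gpu_usage` change above makes `torch` an optional dependency: GPU memory is only sampled when the package is importable. A standard-library sketch of the same guard, assuming evalscope's `check_import('torch')` helper performs a comparable availability check:

```python
import importlib.util


def peak_gpu_memory_gb() -> float:
    """Return peak allocated CUDA memory in GiB, or 0.0 when torch is unavailable."""
    # Equivalent of the check_import('torch') guard: bail out before importing.
    if importlib.util.find_spec('torch') is None:
        return 0.0

    import torch  # safe now: the module spec was found above
    if not torch.cuda.is_available():
        return 0.0

    return sum(
        torch.cuda.max_memory_allocated(device) / 2**30  # bytes -> GiB
        for device in range(torch.cuda.device_count())
    )
```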
@@ -126,11 +129,13 @@ class BenchmarkMetrics:
  self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
  self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
  self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
- self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
- + self.n_total_completion_tokens) / self.total_time
+ self.avg_total_token_per_seconds = (
+ self.n_total_prompt_tokens + self.n_total_completion_tokens
+ ) / self.total_time
  self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
  self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
- self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
+ self.n_total_inter_token_latency
+ ) if self.n_total_inter_token_latency else 0.0
  self.qps = self.n_succeed_queries / self.total_time
  except ZeroDivisionError as e:
  logger.exception(e)
@@ -56,7 +56,8 @@ def transpose_results(data):


  def create_result_table(cursor):
- cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+ cursor.execute(
+ f'''CREATE TABLE IF NOT EXISTS result(
  {DatabaseColumns.REQUEST} TEXT,
  {DatabaseColumns.START_TIME} REAL,
  {DatabaseColumns.CHUNK_TIMES} TEXT,
@@ -69,7+70,8 @@ def create_result_table(cursor):
  {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
  {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
  {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
- )''')
+ )'''
+ )


  def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -89,9 +91,10 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)

  if benchmark_data.success:
  # Add additional columns for success case
- additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
- benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
- benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+ additional_columns = (
+ benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+ benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
+ )
  query = f"""INSERT INTO result(
  {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
@@ -124,7 +127,7 @@ def get_result_db_path(args: Arguments):

  logger.info(f'Save the data base to: {result_db_path}')
  if os.path.exists(result_db_path):
- logger.warning('The db file exists, delete it and start again!.')
+ logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
  sys.exit(1)

  return result_db_path
@@ -1,6 +1,5 @@
  import os
  import subprocess
- import torch
  import uvicorn
  from contextlib import asynccontextmanager
  from dataclasses import dataclass
@@ -10,6 +9,7 @@ from sse_starlette.sse import EventSourceResponse

  from evalscope.perf.arguments import Arguments
  from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -61,8 +61,12 @@ class ServerSentEvent(object):
  @asynccontextmanager
  async def lifespan(app: FastAPI):
  yield
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
+ try:
+ import torch
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ except ImportError:
+ pass


  def create_app(model, attn_implementation=None) -> FastAPI:
@@ -98,10 +102,14 @@ def create_app(model, attn_implementation=None) -> FastAPI:
  def start_app(args: Arguments):
  logger.info('Starting local server, please wait...')
  if args.api == 'local':
+ check_import('torch', 'torch', raise_error=True)
+
  app = create_app(args.model, args.attn_implementation)
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

  elif args.api == 'local_vllm':
+ import torch
+
  os.environ['VLLM_USE_MODELSCOPE'] = 'True'
  os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
  os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
@@ -32,8 +32,9 @@ def analyze_results(all_results):
  avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
  avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
  p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
- success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
- / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+ success_rate = (
+ total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+ ) * 100
  avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
  p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]

@@ -55,12 +56,13 @@ def analyze_results(all_results):
  f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
  ])

- total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
- Metrics.SUCCEED_REQUESTS, 0)
+ total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+ 0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
  total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
  except Exception as e:
  logger.warning(
- f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+ f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+ )
  continue

  if not summary:
@@ -138,7 +140,8 @@ def print_summary(all_results, model_name):
  f'{float(row[8]):.3f}', # Average TPOT
  f'{float(row[9]):.3f}', # P99 TPOT
  row[6], # Success Rate
- style=row_style)
+ style=row_style
+ )
  except ValueError as e:
  console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
  continue
@@ -156,8 +159,9 @@ def print_summary(all_results, model_name):
  perf_info.add_column('Value', style='green', width=40)

  perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
- perf_info.add_row('Lowest Latency',
- f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+ perf_info.add_row(
+ 'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+ )

  console.print('\n')
  console.print(perf_info)
@@ -166,7 +170,8 @@ def print_summary(all_results, model_name):
  recommendations = []
  if best_rps_idx == len(summary) - 1:
  recommendations.append(
- 'The system seems not to have reached its performance bottleneck, try higher concurrency')
+ 'The system seems not to have reached its performance bottleneck, try higher concurrency'
+ )
  elif best_rps_idx == 0:
  recommendations.append('Consider lowering concurrency, current load may be too high')
  else:
@@ -175,7 +180,8 @@ def print_summary(all_results, model_name):
  success_rate = float(summary[-1][6][:-1])
  if success_rate < 95:
  recommendations.append(
- 'Success rate is low at high concurrency, check system resources or reduce concurrency')
+ 'Success rate is low at high concurrency, check system resources or reduce concurrency'
+ )

  recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
  console.print(recommend_text)
@@ -6,7 +6,7 @@ from evalscope.utils.import_utils import _LazyModule
  if TYPE_CHECKING:
  from .combinator import gen_table, get_data_frame, get_report_list
  from .generator import ReportGenerator
- from .utils import Category, Report, ReportKey, Subset
+ from .report import Category, Report, ReportKey, Subset

  else:
  _import_structure = {
@@ -14,12 +14,11 @@ else:
  'gen_table',
  'get_data_frame',
  'get_report_list',
- 'gen_report_table',
  ],
  'generator': [
  'ReportGenerator',
  ],
- 'utils': [
+ 'report': [
  'Category',
  'Report',
  'ReportKey',
@@ -6,7 +6,7 @@ import pandas as pd
  from tabulate import tabulate
  from typing import List, Tuple

- from evalscope.report.utils import Report
+ from evalscope.report.report import Report
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -32,25 +32,30 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:
  return report_list


- def get_data_frame(report_list: List[Report],
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> pd.DataFrame:
+ def get_data_frame(
+ report_list: List[Report],
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> pd.DataFrame:
  tables = []
  for report in report_list:
  df = report.to_dataframe(
  flatten_metrics=flatten_metrics,
  flatten_categories=flatten_categories,
- add_overall_metric=add_overall_metric)
+ add_overall_metric=add_overall_metric
+ )
  tables.append(df)
  return pd.concat(tables, ignore_index=True)


- def gen_table(reports_path_list: list[str] = None,
- report_list: list[Report] = None,
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> str:
+ def gen_table(
+ reports_path_list: list[str] = None,
+ report_list: list[Report] = None,
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> str:
  """
  Generates a formatted table from a list of report paths or Report objects.

@@ -78,7 +83,8 @@ def gen_table(reports_path_list: list[str] = None,
  report_list,
  flatten_metrics=flatten_metrics,
  flatten_categories=flatten_categories,
- add_overall_metric=add_overall_metric)
+ add_overall_metric=add_overall_metric
+ )
  return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)

@@ -3,29 +3,54 @@ from pandas import DataFrame
  from typing import TYPE_CHECKING

  from evalscope.constants import DataCollection
- from evalscope.report.utils import *
+ from evalscope.report.report import *

  if TYPE_CHECKING:
- from evalscope.benchmarks import DataAdapter
+ from evalscope.api.benchmark import DataAdapter
+ from evalscope.api.metric import AggScore


  class ReportGenerator:

  @staticmethod
- def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
+ def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+ metrics_list = []
+ for metric_name, group_metric in df.groupby('metric', sort=False):
+ categories = []
+ for category_name, group_category in group_metric.groupby('categories'):
+ subsets = []
+ for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+ 'subset_name']):
+ avg_score = group_subset['score'].mean()
+ num = group_subset['score'].count()
+ subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+ categories.append(Category(name=category_name, subsets=subsets))
+ metrics_list.append(Metric(name=metric_name, categories=categories))
+ return Report(
+ name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
+ )
+
+ @staticmethod
+ def generate_report(
+ score_dict: Dict[str, List['AggScore']],
+ model_name: str,
+ data_adapter: 'DataAdapter',
+ add_aggregation_name: bool = True
+ ) -> Report:
  """
  Generate a report for a specific dataset based on provided subset scores.

  Args:
  subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
- {
- 'subset_name': [
- {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
- {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
- ],
- ...
- }
- report_name (str): The name of the report to generate.
+ ```
+ {
+ 'subset_name': [
+ AggScore={'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+ AggScore={'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+ ],
+ ...
+ }
+ ```
  data_adapter (DataAdapter): An adapter object for data handling.

  Returns:
@@ -48,18 +73,25 @@ class ReportGenerator:
  1 ARC-Challenge 0.5 2 [default] AverageAccuracy
  """
  subsets = []
- for subset_name, scores in subset_score_map.items():
- for score_item in scores:
+ for subset_name, agg_scores in score_dict.items():
+ for agg_score_item in agg_scores:
  categories = category_map.get(subset_name, ['default'])
+ if add_aggregation_name and agg_score_item.aggregation_name:
+ metric_name = f'{agg_score_item.aggregation_name}_{agg_score_item.metric_name}'
+ else:
+ metric_name = agg_score_item.metric_name
+
  if isinstance(categories, str):
  categories = [categories]
  subsets.append(
  dict(
  name=subset_name,
- score=score_item['score'],
- num=score_item['num'],
- metric_name=score_item['metric_name'],
- categories=tuple(categories)))
+ score=agg_score_item.score,
+ num=agg_score_item.num,
+ metric_name=metric_name,
+ categories=tuple(categories)
+ )
+ )
  df = pd.DataFrame(subsets)
  return df

@@ -83,22 +115,6 @@ class ReportGenerator:
  dataset_name=dataset_name,
  model_name=model_name,
  dataset_description=data_adapter.description,
- dataset_pretty_name=data_adapter.pretty_name)
+ dataset_pretty_name=data_adapter.pretty_name
+ )
  return report
-
- @staticmethod
- def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
- categories = []
- for category_name, group_category in df.groupby('categories'):
- subsets = []
- for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
- avg_score = group_subset['score'].mean()
- num = group_subset['score'].count()
- subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
- categories.append(Category(name=category_name, subsets=subsets))
- return Report(
- name=DataCollection.NAME,
- metrics=[Metric(name='Average', categories=categories)],
- dataset_name=all_dataset_name,
- model_name=model_name)
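For orientation, the objects assembled by `gen_collection_report` above nest as Subset → Category → Metric → Report. A hedged sketch using only the constructor keywords visible in this diff (the real definitions live in `evalscope/report/report.py`, which is not shown here; the example names and scores are made up):

```python
from evalscope.report.report import Category, Metric, Report, Subset

# One scored subset, grouped under a category, wrapped in a metric.
subset = Subset(name='gsm8k/main', score=0.3389, num=100)
category = Category(name='default', subsets=[subset])
metric = Metric(name='mean_acc', categories=[category])

report = Report(
    name='my_collection',   # the diff above uses DataCollection.NAME here
    metrics=[metric],
    dataset_name='my_datasets',
    model_name='my_model',
)
```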
@@ -152,10 +152,12 @@ class Report:
  data = json.load(f)
  return cls.from_dict(data)

- def to_dataframe(self,
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> pd.DataFrame:
+ def to_dataframe(
+ self,
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> pd.DataFrame:
  """
  Convert the report to a pandas DataFrame.
  Args:
@@ -201,8 +203,8 @@ class Report:
  # multi-level aggregation for categories
  max_depth = df_categories[ReportKey.category_name].apply(len).max()
  for level in range(max_depth):
- df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
- lambda x: x[level] if len(x) > level else None)
+ df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[
+ ReportKey.category_name].apply(lambda x: x[level] if len(x) > level else None)

  df_categories.drop(columns=[ReportKey.category_name], inplace=True)
  return df_categories