evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/evaluator/evaluator.py
@@ -1,483 +1,339 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+"""
+Default evaluator implementation for running benchmark evaluations.
+
+This module provides the DefaultEvaluator class which orchestrates the entire
+evaluation process including data loading, model inference, metric calculation,
+and report generation.
+"""
 
-import json
 import os
-import time
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from copy import deepcopy
 from tqdm import tqdm
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
-from evalscope.benchmarks import DataAdapter
-from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
+from evalscope.api.dataset import Dataset, DatasetDict, Sample
+from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
+from evalscope.api.metric import AggScore, SampleScore
 from evalscope.report import Report, gen_table
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 if TYPE_CHECKING:
-    from evalscope.models import BaseModelAdapter
+    from evalscope.api.benchmark import DataAdapter
+    from evalscope.api.model import Model
+    from evalscope.config import TaskConfig
+    from evalscope.utils.io_utils import OutputsStructure
+
+from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-class Evaluator(object):
+class DefaultEvaluator(Evaluator):
     """
-    The evaluator for model on datasets.
+    Default Evaluator for running evaluations on benchmarks.
+
+    This evaluator handles the complete evaluation pipeline:
+    1. Loading datasets from benchmarks
+    2. Running model inference on samples
+    3. Calculating evaluation metrics
+    4. Generating and saving reports
+    5. Managing caching for predictions and reviews
 
     Args:
-        dataset_name_or_path: str, the dataset name or path.
-            if the dataset is a local path, e.g. /path/to/your_dataset_name,
-            then the task name will be the basename of the path, which is `your_dataset_name`.
-        data_adapter: DataAdapter, the data adapter for the dataset.
-        model_adapter: BaseModelAdapter, the model adapter for the model.
-        outputs: OutputsStructure, the outputs dir. Default: None
-        task_cfg: TaskConfig, the overall task config. Default: None
-        **kwargs: kwargs.
+        benchmark: The data adapter for loading and processing data.
+        model: The model to be evaluated.
+        outputs: The output structure for saving evaluation results.
+        task_config: The task configuration.
     """
 
-    def __init__(self,
-                 data_adapter: DataAdapter,
-                 model_adapter: 'BaseModelAdapter',
-                 outputs: OutputsStructure = None,
-                 task_cfg: TaskConfig = None,
-                 **kwargs):
-
-        self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
-        self.model_name = task_cfg.model_id
-
-        self.data_adapter = data_adapter
-        self.model_adapter = model_adapter
-        self.model_cfg = model_adapter.model_cfg
-        self.eval_type = task_cfg.eval_type
-        self.dataset_hub = task_cfg.dataset_hub
-        self.stage = task_cfg.stage
-        self.use_cache = task_cfg.use_cache
-        self.task_cfg = task_cfg
-        # Deal with the output paths
-        self.outputs_structure = outputs
-        self.kwargs = kwargs
-
-        self._init_judge()
-
-    def _init_judge(self):
-        if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
-            self.judge = None
-        else:
-            from evalscope.metrics import LLMJudge
-            self.judge = LLMJudge(**self.task_cfg.judge_model_args)
-
-    def load_dataset(self):
-        dataset = self.data_adapter.load(
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
-
-        # Get prompts from dataset
-        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
-        # Limit and index prompts
-        limited_prompts = defaultdict(list)
-        for subset_name, prompts_list in prompts.items():
-            # If limit is None, use all prompts
-            if self.task_cfg.limit is None:
-                limit = len(prompts_list)
-            else:
-                if isinstance(self.task_cfg.limit, int):
-                    limit = self.task_cfg.limit
-                elif isinstance(self.task_cfg.limit, float):
-                    limit = int(len(prompts_list) * self.task_cfg.limit)
-            # Limit the number of prompts
-            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
-                prompt[AnswerKeys.INDEX] = index
-                limited_prompts[subset_name].append(prompt)
-
-        return limited_prompts
-
-    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
-        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
-        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
-        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
-        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-        answer_d[AnswerKeys.ANSWER_ID] = answer_id
-        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
-        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
-        return answer_d
-
-    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        try:
-            # get answer from model
-            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
-        except Exception as e:
-            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
-            # if ignore_errors is True, continue to next input
-            if self.task_cfg.ignore_errors:
-                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return []
-            else:
-                raise e
-        # process answer
-        answers_list = []
-        for answer_d, input_prompt in zip(answer_ds, input_prompts):
-            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-        return answers_list
-
-    @staticmethod
-    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
-        # Filter prompts that have been answered
-        answers_list = []
-        if not use_cache or not os.path.exists(pred_file_path):
-            return answers_list, prompts_list
-
-        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-            indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
-
-            if all(index is None for index in indices):
-                return list(range(len(answers_list)))
-
-            return [index for index in indices if index is not None]
-
-        answers_list = jsonl_to_list(pred_file_path)
-        answered_indices = set(get_answered_indices(answers_list))
-        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
-
-        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
-        return answers_list, prompts
-
-    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
+    def __init__(
+        self,
+        benchmark: 'DataAdapter',
+        model: 'Model',
+        outputs: 'OutputsStructure',
+        task_config: 'TaskConfig',
+    ):
+        # Store core components needed for evaluation
+        self.benchmark = benchmark
+        self.model = model
+        self.outputs = outputs
+        self.task_config = task_config
+
+        # Extract frequently used identifiers
+        self.benchmark_name = benchmark.name
+        """Name of the benchmark being evaluated."""
+
+        self.model_name = task_config.model_id
+        """ID of the model being evaluated."""
+
+        self.use_cache = task_config.use_cache
+        """Whether to use cache for predictions."""
+
+        # Initialize cache manager for storing and retrieving cached results
+        self.cache_manager = CacheManager(
+            outputs=outputs,
+            model_name=self.model_name,
+            benchmark_name=self.benchmark_name,
+        )
+
+    def eval(self) -> Report:
         """
-        Get answers from model inference.
-        It is required to rewrite this method to support your own evaluator.
+        Run the complete evaluation process.
 
-        Args:
-            subset_name: subset name for benchmark.
-            prompts_list: prompts list.
-            infer_cfg: model inference config.
-                Attributes:
-                    do_sample: bool, whether to use sampling.
-                    top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
-                    top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
-                    temperature: float, the value used to module the next token probabilities.
-                    num_beams: int, number of beams for beam search. 1 means no beam search.
-                    max_length: int, the max length of the sequence to be generated.
-                    max_new_tokens: int, the max number of new tokens to be generated.
-                    repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            **kwargs: kwargs.
-
-        Returns: The list of answers.
+        This is the main entry point that orchestrates the entire evaluation:
+        1. Load dataset from benchmark
+        2. Evaluate each subset independently
+        3. Aggregate scores across subsets
+        4. Generate final evaluation report
+
+        Returns:
+            Report: The complete evaluation report containing all metrics and results.
         """
-        assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
-        assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
-        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
-
-        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
-        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
-        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
-
-        eval_batch_size = self.task_cfg.eval_batch_size
-        if self.task_cfg.eval_type == EvalType.SERVICE:
-            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
-                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
-                    futures = []
-                    for input_prompt in prompts_list:
-                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
-                    for future in as_completed(futures):
-                        answer_ds: List[dict] = future.result()
-                        answers_list.extend(answer_ds)
-                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                        pbar.update(len(answer_ds))
-        else:
-            batch_prompts_list = [
-                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
-            ]
-            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
-                for batch_prompts in batch_prompts_list:
-                    answer_ds: List[dict] = self._get_answer(
-                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
-                    answers_list.extend(answer_ds)
-                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
-                    pbar.update(len(batch_prompts))
-
-        logger.info(f'Dump predictions to {pred_file_path}.')
-        return answers_list
-
-    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:
-
-        if reviewer_spec is None:
-            reviewer_spec = {}
-
-        review_res = deepcopy(answer_d)
-        if AnswerKeys.CHOICES not in review_res:
-            review_res[AnswerKeys.CHOICES] = []
-            review_res[ReviewKeys.REVIEWED] = True
-            review_res[ReviewKeys.REVIEW_ID] = None
-            review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-            review_res[ReviewKeys.REVIEW_TIME] = time.time()
-            logger.warning(f'No choices found for answer dict: {review_res}')
-            return review_res
-
-        rev_choices = []
-        for choice in review_res[AnswerKeys.CHOICES]:
-            raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
-            answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-            gold_content = self.data_adapter.get_gold_answer(raw_input_d)
-
-            # Get review result based on judge strategy
-            use_llm = (
-                self.task_cfg.judge_strategy == JudgeStrategy.LLM
-                or (self.task_cfg.judge_strategy == JudgeStrategy.AUTO and self.data_adapter.llm_as_a_judge))
-
-            if use_llm:
-                # Use LLM as judge
-                assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
-                pred_content = self.data_adapter.llm_parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.llm_match(
-                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
-            else:
-                # Use rule-based judging
-                pred_content = self.data_adapter.parse_pred_result(
-                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                review_result = self.data_adapter.match(gold_content, pred_content)
-
-                # For LLM_RECALL strategy, use LLM to re-judge if rule-based result is not good
-                if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
-                        and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
-                    assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
-                    pred_content = self.data_adapter.llm_parse_pred_result(
-                        result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
-                    review_result = self.data_adapter.llm_match(
-                        gold_content, pred_content, self.judge, raw_input=raw_input_d)
-
-            choice[ReviewKeys.REVIEW] = {
-                ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-                ReviewKeys.PRED: pred_content,
-                ReviewKeys.RESULT: review_result
-            }
-            rev_choices.append(choice)
-
-        review_res[AnswerKeys.CHOICES] = rev_choices
-        review_res[ReviewKeys.REVIEWED] = True
-        review_res[ReviewKeys.REVIEW_ID] = review_id
-        review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
-        review_res[ReviewKeys.REVIEW_TIME] = time.time()
-
-        return review_res
-
-    def _generate_review_id(self, answer_d):
-        # Gen review_id (concat: answer_id + reviewer_spec)
-        answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
-        reviewer_spec_str = json.dumps(
-            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-        return review_id, reviewer_spec
-
-    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
+        # Load the dataset and evaluate each subset
+        dataset_dict = self.benchmark.load_dataset()
+        agg_score_dict = defaultdict(list)
+
+        # Process each subset (e.g., test, validation) independently
+        for subset, dataset in dataset_dict.items():
+            if len(dataset) == 0:
+                logger.info(f'No samples found in subset: {subset}, skipping.')
+                continue
+            subset_score = self.evaluate_subset(subset, dataset)
+            agg_score_dict[subset] = subset_score
+
+        # Generate the report based on aggregated scores
+        report = self.get_report(agg_score_dict)
+        return report
+
+    def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
         """
-        Get reviews from answers.
-        It is required to rewrite this method to support your own evaluator.
+        Evaluate a single subset of the dataset.
+
+        This method processes one subset through the complete evaluation pipeline:
+        1. Get model predictions for all samples
+        2. Calculate evaluation metrics for predictions
+        3. Aggregate individual sample scores
 
         Args:
-            subset_name: subset name of benchmark
-            answers_list: inference results list.
-            **kwargs: kwargs.
+            subset: Name of the subset being evaluated (e.g., 'test', 'validation').
+            dataset: The dataset subset containing samples to evaluate.
 
-        Returns: reviews list.
+        Returns:
+            List[AggScore]: Aggregated scores for this subset.
         """
-        reviews_list = []
-
-        review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
-        review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
-        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
-
-        # Load existing reviews if using cache
-        existing_reviews = {}
-        if self.use_cache and os.path.exists(review_file_path):
-            with open(review_file_path, 'r') as f:
-                for line in f:
-                    review = json.loads(line.strip())
-                    existing_reviews[review['index']] = review
-            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
-
-        def process_single_review(answer_d):
-            # Check if review already exists in cache
-            if self.use_cache and answer_d['index'] in existing_reviews:
-                return existing_reviews[answer_d['index']]
-
-            review_id, reviewer_spec = self._generate_review_id(answer_d)
-            # Get review
-            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
-            logger.debug(review_d)
-            return review_d
-
-        with ThreadPoolExecutor(max_workers=self.task_cfg.judge_worker_num) as executor:
-            # Submit all tasks and get futures
-            futures = [executor.submit(process_single_review, answer_d) for answer_d in answers_list]
-
-            # Process completed futures with progress bar
-            for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
-                review_d = future.result()
-                reviews_list.append(review_d)
-                # Dump new reviews only if not using cache or review is new
-                if not self.use_cache or review_d['index'] not in existing_reviews:
-                    dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
-
-        return reviews_list
-
-    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
+        # Get model predictions for all samples in the subset
+        task_states = self.get_answers(subset, dataset)
+
+        # Calculate evaluation metrics for each prediction
+        sample_scores = self.get_reviews(subset, task_states)
+
+        # Aggregate individual sample scores into subset-level metrics
+        agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
+        return agg_scores
+
+    def get_answers(self, subset: str, dataset: Dataset) -> List[TaskState]:
         """
-        To compute metrics from reviews_list for each subset.
-        It is required to rewrite this method to support your own evaluator.
+        Get model predictions for all samples in the dataset subset.
+
+        This method handles:
+        1. Loading cached predictions if available and caching is enabled
+        2. Running model inference on remaining samples in parallel
+        3. Saving new predictions to cache
 
         Args:
-            reviews_list: reviews list.
+            subset: Name of the subset being processed.
+            dataset: The dataset subset containing samples for prediction.
 
         Returns:
-            The metric result. Depends on the metric function in data_adapter.
+            List[TaskState]: Task states containing model predictions for each sample.
         """
-        # Get max choices
-        choices_lengths = [
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
-        ]
-        if choices_lengths:
-            max_choices = max(choices_lengths)
+        # Initialize task state list and filter cached predictions if caching is enabled
+        if self.use_cache:
+            task_state_list, dataset = self.cache_manager.filter_prediction_cache(subset, dataset)
         else:
-            max_choices = 0
-
-        # Get review result
-        review_res_list = []
-        for review_d in reviews_list:
-            if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
+            task_state_list = []
 
-            if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
-                continue
-            elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
-                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
-            else:
-                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
-                if len(review_d[AnswerKeys.CHOICES]) < max_choices:
-                    logger.warning(
-                        f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
-                        f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')
+        # Get output directory for storing model predictions
+        model_prediction_dir = os.path.dirname(self.cache_manager.get_prediction_cache_path(subset))
 
-            review_res_list.append(review_res)
+        # Convert dataset to list for parallel processing
+        dataset_list = list(dataset)
 
-        metric_score: List[dict] = self.data_adapter.compute_metric(
-            review_res_list=review_res_list, reviews_list=reviews_list)
+        if not dataset_list:
+            return task_state_list
 
-        return metric_score
+        # Process samples in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
+            # Submit all prediction tasks
+            future_to_sample = {
+                executor.submit(self._predict_sample, sample, model_prediction_dir): sample
+                for sample in dataset_list
+            }
 
-    def dump_report(self, reviews_score_all: List[dict]):
+            # Process completed tasks with progress bar
+            with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_sample):
+                    sample = future_to_sample[future]
+                    try:
+                        task_state = future.result()
+                        task_state_list.append(task_state)
+
+                        # Save the prediction result to cache for future use
+                        model_result = self.cache_manager.save_prediction_cache(
+                            subset, task_state, self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+                    except Exception as exc:
+                        logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return task_state_list
+
+    def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
         """
-        Get report for total reviews of specific dataset.
-        It is required to rewrite this method to support your own evaluator.
+        Helper method to predict a single sample.
 
         Args:
-            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
+            sample: The sample to predict.
+            model_prediction_dir: Directory for storing model predictions.
 
-        Returns: None
+        Returns:
+            TaskState: The task state containing the prediction result.
         """
-        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
-        os.makedirs(report_path, exist_ok=True)
-        # Get report map
-        report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, model_name=self.model_name)
-
-        # Make table
-        try:
-            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
-            logger.info(f'\n{self.dataset_name_or_path} report table:'
-                        f'\n{report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-        # Make report analysis
-        if self.task_cfg.analysis_report:
-            logger.info('Generating report analysis, please wait ...')
-            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
-            logger.info('Report analysis:\n%s', analysis)
-        else:
-            logger.info('Skipping report analysis (`analysis_report=False`).')
+        logger.debug(f'\n{sample.pretty_print()}')
 
-        # Dump report
-        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
-        report_map.to_json(report_file)
-        logger.info(f'Dump report to: {report_file} \n')
+        # Run model inference on the current sample
+        task_state = self.benchmark.run_inference(model=self.model, sample=sample, output_dir=model_prediction_dir)
+        return task_state
 
-        # Post process report
-        try:
-            self.data_adapter.post_process_report(report_map, report_path=report_path)
-        except Exception as e:
-            logger.error(f'Failed to post process report: {e}')
-
-        return report_map
-
-    def eval(self, **kwargs) -> dict:
+    def get_reviews(self, subset: str, task_states: List[TaskState]) -> List[SampleScore]:
         """
-        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
-        It is required to rewrite this method to support your own evaluator.
+        Calculate evaluation metrics for model predictions.
 
-        The evaluation process is as follows:
-            1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
-            2. Get the input prompts from dataset with specific data adapter.
-            3. Get answers with model inference.
-            4. Get reviews with metric function (or reviewers).
-            5. Generate report from review results.
+        This method handles:
+        1. Loading cached review results if available and caching is enabled
+        2. Computing metrics for remaining task states in parallel
+        3. Saving new review results to cache
 
         Args:
-            infer_cfg: The config for model inference.
+            subset: Name of the subset being reviewed.
+            task_states: List of task states containing model predictions.
 
         Returns:
-            Dict of results. Depends on the stage of evaluation.
+            List[SampleScore]: Evaluation scores for each sample.
+        """
+        # Initialize sample score list and filter cached reviews if caching is enabled
+        if self.use_cache and not self.task_config.rerun_review:
+            sample_score_list, task_states = self.cache_manager.filter_review_cache(subset, task_states)
+        else:
+            # Init a clean sample score list
+            sample_score_list = []
+            self.cache_manager.delete_review_cache(subset)
+
+        if not task_states:
+            return sample_score_list
+
+        # Process task states in parallel using ThreadPoolExecutor
+        with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
+            # Submit all review tasks
+            future_to_task_state = {
+                executor.submit(self._review_task_state, task_state): task_state
+                for task_state in task_states
+            }
 
-            stage == 'all': return the report_map
-            stage == 'infer': return the answers_map
-            stage == 'review': return the reviews_map
+            # Process completed tasks with progress bar
+            with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
+                for future in as_completed(future_to_task_state):
+                    task_state = future_to_task_state[future]
+                    try:
+                        sample_score = future.result()
+                        sample_score_list.append(sample_score)
+
+                        # Save the review result to cache for future use
+                        review_result = self.cache_manager.save_review_cache(
+                            subset=subset,
+                            task_state=task_state,
+                            sample_score=sample_score,
+                            save_metadata=self.benchmark.save_metadata
+                        )
+                        logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+                    except Exception as exc:
+                        logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
+                        if self.task_config.ignore_errors:
+                            logger.warning('Error ignored, continuing with next sample.')
+                        else:
+                            raise exc
+                    finally:
+                        pbar.update(1)
+
+        return sample_score_list
+
+    def _review_task_state(self, task_state: TaskState) -> SampleScore:
         """
+        Helper method to review a single task state.
 
-        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
+        Args:
+            task_state: The task state to review.
 
-        reviews_score_all = {}  # {subset_name: (score, num)}
-        stage_answers_dict = {}
-        stage_reviews_dict = {}
+        Returns:
+            SampleScore: The evaluation score for the task state.
+        """
+        # Compute evaluation metrics using the benchmark's metric calculation
+        sample_score = self.benchmark.calculate_metrics(task_state=task_state)
+        return sample_score
 
-        prompts = self.load_dataset()
-        for subset_name, prompts_list in prompts.items():
+    def get_report(self, agg_score_dict: Dict[str, List[AggScore]]) -> Report:
+        """
+        Generate a comprehensive evaluation report from aggregated scores.
 
-            answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
-            if self.stage == EvalStage.INFER:
-                stage_answers_dict[subset_name] = answers_list
-                continue
+        This method handles:
+        1. Creating the evaluation report from scores
+        2. Generating and displaying a summary table
+        3. Optionally generating detailed analysis
+        4. Saving the report to file
 
-            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
+        Args:
+            agg_score_dict: Dictionary mapping subset names to their aggregated scores.
 
-            metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = metric_res
-            stage_reviews_dict[subset_name] = reviews_list
+        Returns:
+            Report: The complete evaluation report.
+        """
+        assert agg_score_dict, 'No scores to generate report from.'
 
-        if self.stage == EvalStage.INFER:
-            return stage_answers_dict
+        # Get paths for saving the report
+        report_path = self.cache_manager.get_report_path()
+        report_file = self.cache_manager.get_report_file()
 
-        if self.stage == EvalStage.REVIEW:
-            return stage_reviews_dict
+        # Generate the main evaluation report using benchmark-specific logic
+        report = self.benchmark.generate_report(
+            scores=agg_score_dict, model_name=self.model_name, output_dir=report_path
+        )
 
-        # Generate report
-        report_map = self.dump_report(reviews_score_all)
+        # Generate and display a summary table of results
+        try:
+            report_table = gen_table(report_list=[report], add_overall_metric=True)
+            logger.info(f'\n{self.benchmark_name} report table:'
+                        f'\n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
 
-        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
+        # Generate detailed analysis if requested in configuration
+        if self.task_config.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_config.judge_model_args)
+            logger.info(f'Report analysis:\n{analysis}')
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
 
-        return report_map
+        # Save the complete report to file
+        report.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+        return report
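
For orientation, a minimal sketch of how the renamed pieces in this diff fit together in 1.0.x: DefaultEvaluator takes a benchmark DataAdapter, a Model, an OutputsStructure, and a TaskConfig, and eval() runs predict, review, aggregate, and report. Only the constructor arguments and the eval() -> Report signature come from the diff above; the import path of DefaultEvaluator and the way the benchmark, model, outputs, and task_config objects are constructed are assumptions and are not shown in this diff.

# A minimal sketch (not taken from this diff): wiring the new DefaultEvaluator.
# Assumption: DefaultEvaluator is exported from evalscope.evaluator
# (evalscope/evaluator/__init__.py changes by +1 -1 in this release).
from evalscope.evaluator import DefaultEvaluator


def run_benchmark(benchmark, model, outputs, task_config):
    """Run one benchmark end to end and return its Report.

    `benchmark` (DataAdapter), `model` (Model), `outputs` (OutputsStructure) and
    `task_config` (TaskConfig) are assumed to be built via the new evalscope.api
    interfaces, whose construction this diff does not show.
    """
    evaluator = DefaultEvaluator(
        benchmark=benchmark,
        model=model,
        outputs=outputs,
        task_config=task_config,
    )
    # eval() loads the dataset, predicts each subset, reviews the predictions,
    # aggregates scores, and writes the JSON report (see get_report above).
    return evaluator.eval()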