evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/process_bench/process_bench_adapter.py
@@ -1,100 +1,168 @@
-import os
+# flake8: noqa: E501
 import re
-from typing import Any, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
-
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
-
-@Benchmark.register(
-    name='process_bench',
-    pretty_name='ProcessBench',
-    tags=['Mathematical', 'Reasoning'],
-    description=
-    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
-    dataset_id='Qwen/ProcessBench',
-    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
-    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-)
-class ProcessBenchAdapter(DataAdapter):
+from typing import Any, Dict, List

-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.metric.scorer import AggScore, SampleScore
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

-        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt'), encoding='utf-8').read()
+logger = get_logger()

-        # register metrics
-        metric_registry.register(Metric(name='error_acc', object=mean))
-        metric_registry.register(Metric(name='correct_acc', object=mean))
-        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+CRITIQUE_TEMPLATE = """The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):

-    def load(self, **kwargs):
-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
+[Math Problem]

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+{problem}

-        problem = input_d['problem']
-        steps = input_d['steps']
-        tagged_response = ''
-        for sdx, step in enumerate(steps):
-            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
-        tagged_response = tagged_response.strip()
+[Solution]

-        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+{tagged_response}

-        return self.gen_prompt_data(full_prompt)
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").

-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return int(input_d['label'])
+Please put your final answer (i.e., the index) in \boxed{{}}.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='process_bench',
+        pretty_name='ProcessBench',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.',  # noqa: E501
+        dataset_id='Qwen/ProcessBench',
+        subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+        metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+        eval_split='test',
+        prompt_template=CRITIQUE_TEMPLATE
+    )
+)
+class ProcessBenchAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.split_as_subset = True  # Use split as subset

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-        Parse the predicted result and extract proper answer.
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
         """
-        pred = ProcessBenchAdapter.extract_answer(result)
+        problem = record['problem']
+        steps = record['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        return Sample(
+            input=problem,
+            target=str(record['label']),
+            metadata={
+                'steps': steps,
+                'tagged_response': tagged_response,
+                'final_answer_correct': record['final_answer_correct']
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with problem and tagged response."""
+        problem = sample.input
+        tagged_response = sample.metadata['tagged_response']
+        return self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        """Extract the answer from the model prediction."""
+        pred = self._extract_answer_from_text(prediction)
         try:
             pred = int(pred)
         except Exception:
             pred = None
         return pred

-    def match(self, gold: int, pred: int) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return gold == pred
-
-    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
-        reviews_list = kwargs['reviews_list']
-        error_data = []
-        correct_data = []
-        for res, raw in zip(review_res_list, reviews_list):
-            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
-                correct_data.append(res)
-            else:
-                error_data.append(res)
-        data = {}
-        if len(correct_data) != 0:
-            data.update({'correct_acc': correct_data})
-        if len(error_data) != 0:
-            data.update({'error_acc': error_data})
-        data.update({'simple_f1_score': (correct_data, error_data)})
-        return super().compute_metric(data)
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        score = Score(
+            extracted_prediction=str(filtered_prediction) if filtered_prediction is not None else None,
+            prediction=original_prediction,
+        )
+
+        # Convert filtered_prediction to int if possible
+        try:
+            pred_int = int(filtered_prediction) if filtered_prediction is not None else None
+        except (ValueError, TypeError):
+            pred_int = None
+
+        # Calculate accuracy
+        reference = int(reference) if reference is not None else None
+        accuracy = 1.0 if reference == pred_int else 0.0
+
+        # Determine metric name based on label
+        if reference == -1:
+            metric_name = 'correct_acc'
+        else:
+            metric_name = 'error_acc'
+
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores to compute final metrics including F1 score."""
+        correct_scores = []
+        error_scores = []
+
+        for sample_score in sample_scores:
+            score = sample_score.score
+            if 'correct_acc' in score.value:
+                correct_scores.append(score.value['correct_acc'])
+            elif 'error_acc' in score.value:
+                error_scores.append(score.value['error_acc'])
+
+        agg_list = []
+
+        if correct_scores:
+            agg_list.append(
+                AggScore(
+                    metric_name='correct_acc', score=sum(correct_scores) / len(correct_scores), num=len(correct_scores)
+                )
+            )
+
+        if error_scores:
+            agg_list.append(
+                AggScore(metric_name='error_acc', score=sum(error_scores) / len(error_scores), num=len(error_scores))
+            )
+
+        # Calculate simple F1 score
+        if correct_scores and error_scores:
+            from evalscope.metrics import simple_f1_score
+            agg_list.append(
+                AggScore(
+                    metric_name='simple_f1_score',
+                    score=simple_f1_score((correct_scores, error_scores)),
+                    num=len(correct_scores) + len(error_scores)
+                )
+            )
+
+        return agg_list

     @staticmethod
-    def extract_answer(solution_text: str):
+    def _extract_answer_from_text(solution_text: str):
+        """Extract answer from solution text using boxed pattern."""
         boxed_pattern = r'\\boxed\{([^}]*)\}'
         matches = re.findall(boxed_pattern, solution_text)
         if matches:
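The rewritten adapter splits grading into a per-sample `match_score` (each sample lands in either `correct_acc` or `error_acc`, depending on whether the gold label is `-1`, i.e. the solution contains no error) and a corpus-level `aggregate_scores`. Below is a minimal standalone sketch of that flow, not part of the diff: the harmonic-mean F1 at the end is an assumption about what `evalscope.metrics.simple_f1_score` computes, and the choice of which `\boxed{}` match wins is outside the shown hunk.

```python
import re


def extract_boxed_index(solution_text: str):
    """Pull a \\boxed{...} value out of a model response (last match wins in this sketch)."""
    matches = re.findall(r'\\boxed\{([^}]*)\}', solution_text)
    try:
        return int(matches[-1]) if matches else None
    except ValueError:
        return None


def route_sample(reference: int, prediction):
    """Gold label -1 means 'no error', so the sample counts toward correct_acc."""
    metric = 'correct_acc' if reference == -1 else 'error_acc'
    return metric, 1.0 if prediction == reference else 0.0


# Three toy graded samples: (gold label, parsed model answer).
graded = [
    (-1, extract_boxed_index(r'I see no mistake, so the answer is \boxed{-1}.')),
    (3, extract_boxed_index(r'The earliest error is in \boxed{3}.')),
    (2, extract_boxed_index(r'Paragraph \boxed{4} is wrong.')),
]

buckets = {'correct_acc': [], 'error_acc': []}
for ref, pred in graded:
    metric, acc = route_sample(ref, pred)
    buckets[metric].append(acc)

correct_acc = sum(buckets['correct_acc']) / len(buckets['correct_acc'])
error_acc = sum(buckets['error_acc']) / len(buckets['error_acc'])
# Assumed F1: harmonic mean of the two subset accuracies, as in the ProcessBench paper.
f1 = 2 * correct_acc * error_acc / (correct_acc + error_acc) if correct_acc + error_acc else 0.0
print(correct_acc, error_acc, f1)  # 1.0 0.5 0.666...
```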
evalscope/benchmarks/race/race_adapter.py
@@ -1,135 +1,49 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-import os
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 # flake8: noqa

 logger = get_logger()


-@Benchmark.register(
-    name='race',
-    pretty_name='RACE',
-    tags=['Reasoning', 'MCQ'],
-    description=
-    'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.',  # noqa: E501
-    dataset_id='modelscope/race',
-    model_adapter=OutputType.MULTIPLE_CHOICE,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=['high', 'middle'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=3,
-    train_split='train',
-    eval_split='test',
+@register_benchmark(
+    BenchmarkMeta(
+        name='race',
+        pretty_name='RACE',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.',  # noqa: E501
+        dataset_id='evalscope/race',
+        metric_list=['acc'],
+        subset_list=['high', 'middle'],
+        few_shot_num=3,
+        train_split='train',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class RACEAdapter(DataAdapter):
+class RACEAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
-        few_shot_num = kwargs.get('few_shot_num', 3)
-        if few_shot_num > 3:
-            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-            kwargs['few_shot_num'] = 3
-
         super().__init__(**kwargs)

-        self.choices = ['A', 'B', 'C', 'D']
-
-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, subset_name, f'{split}.jsonl')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, subset_name, f'{split}.jsonl')
-                if os.path.exists(file_path):
-                    data_dict[subset_name][split] = jsonl_to_list(file_path)
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for RACE benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the RACE:
-
-            {'example_id': 'high3680.txt',
-             'article': 'Astronauts on shorter shuttle missions often work very long days. Tasks are scheduled so tightly that break times are often used to finish the day's work. This type of schedule is far too demanding for long missions on the International Space Station(ISS). ISS crewmembers usually live in space for at least a quarter of a year. They work five days on and two days off to _ the normal way they do things on Earth as much as possible. Weekends give the crew valuable time to rest and do a few hours of housework. They can communicate with family and friends by email , internet phone and through private video conferences. While astronauts cannot go to a baseball game or a movie in orbit, there are many familiar activities that they can still enjoy . Before a mission, the family and friends of each ISS crewmember put together a collection of family photos, messages, videos and reading material for the astronauts to look at when they will be floating 370 kilometers above the Earth. During their mission, the crew also receives care packages with CDs, books, magazines, photos and letters . And as from early 2010, the internet became available on the ISS , giving astronauts the chance to do some "web surfing "in their personal time. Besides relaxing with these more common entertainments, astronauts can simply enjoy the experience of living in space. Many astronauts say that one of the most relaxing things to do in space is to look out the window and stare at the universe and the Earth's vast land mass and oceans.',
-             'answer': 'C',
-             'question': 'The passage mainly discusses how astronauts _ .',
-             'options': [
-                 "work for longer missions in space",
-                 "connect with people on the Earth",
-                 "spend their free time in space",
-                 "observe the Earth from space"]}
-
-        Returns:
-            {'data': [(context, continuation), ...]}
-
-        """
-        prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
-            self._format_subject(subset_name))
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
-
-        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answer', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: The evaluation type. e.g. 'checkpoint' or 'service' or 'custom'.
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-        input_choices: list = input_d['options']
-
-        example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
-        for j in range(len(self.choices)):
-            example += '\n{}. {}'.format(self.choices[j], input_choices[j])
-
-        example += '\nAnswer:'
-        if include_answer:
-            example += ' {}\n\n'.format(input_d['answer'])
+        if self.few_shot_num > 3:
+            logger.warning(f'few_shot_num <= 3 for RACE, but got {self.few_shot_num}. Use 3-shot by default.')
+            self.few_shot_num = 3

-        return example
+    def record_to_sample(self, record) -> Sample:
+        # Format the article and question as context
+        context = f"Article:\n{record['article']}\nQuestion:\n{record['question']}"

-    @classmethod
-    def _format_subject(cls, subject):
-        l = subject.split('_')
-        s = ''
-        for entry in l:
-            s += ' ' + entry
-        return s
+        return Sample(
+            input=context,
+            choices=record['options'],
+            target=record['answer'],
+            metadata={'example_id': record.get('example_id', 'unknown')},
+        )
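In 1.0 the RACE adapter no longer assembles the few-shot prompt by hand; it only maps a raw record to a `Sample` and leaves prompt construction to `MultiChoiceAdapter` and `MultipleChoiceTemplate.SINGLE_ANSWER_COT`. A small sketch of that mapping follows (not part of the diff), using a local stand-in dataclass for `Sample` so it runs without evalscope installed; the real class in `evalscope.api.dataset` has more fields.

```python
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class Sample:
    """Local stand-in for evalscope.api.dataset.Sample (only the fields used here)."""
    input: str
    choices: Optional[List[str]] = None
    target: str = ''
    metadata: Dict = field(default_factory=dict)


def record_to_sample(record: Dict) -> Sample:
    # Fold article and question into one input; the options become the choice list.
    context = f"Article:\n{record['article']}\nQuestion:\n{record['question']}"
    return Sample(
        input=context,
        choices=record['options'],
        target=record['answer'],
        metadata={'example_id': record.get('example_id', 'unknown')},
    )


record = {
    'example_id': 'high3680.txt',
    'article': 'Astronauts on shorter shuttle missions often work very long days. ...',
    'question': 'The passage mainly discusses how astronauts _ .',
    'options': [
        'work for longer missions in space',
        'connect with people on the Earth',
        'spend their free time in space',
        'observe the Earth from space',
    ],
    'answer': 'C',
}
print(record_to_sample(record).target)  # C
```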
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
@@ -1,13 +1,15 @@
+import ast
 import re
-from collections import defaultdict
-from typing import Any, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

-# flake8: noqa
-
 logger = get_logger()

 GRADER_TEMPLATE = """
@@ -76,7 +78,7 @@ Also note the following things:
 - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".


-Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT_ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
 ```
 Question: {question}
 Gold target: {target}
@@ -92,76 +94,76 @@ Just return the letters "A", "B", or "C", with no text around it.
 """.strip()  # noqa: E501


-@Benchmark.register(
-    name='simple_qa',
-    pretty_name='SimpleQA',
-    tags=['Knowledge', 'QA'],
-    description=
-    'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.',  # noqa: E501
-    dataset_id='AI-ModelScope/SimpleQA',
-    metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test')
-class SimpleQAAdapter(DataAdapter):
+@register_benchmark(
+    BenchmarkMeta(
+        name='simple_qa',
+        pretty_name='SimpleQA',
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.',  # noqa: E501
+        dataset_id='AI-ModelScope/SimpleQA',
+        metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='Answer the question:\n\n{question}'
+    )
+)
+class SimpleQAAdapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        # register metrics
-        metric_registry.register(Metric(name='is_correct', object=mean))
-        metric_registry.register(Metric(name='is_incorrect', object=mean))
-        metric_registry.register(Metric(name='is_not_attempted', object=mean))
-
-        # whether to use LLM as a judge
-        self.llm_as_a_judge = True
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        question = input_d['problem']
-        return self.gen_prompt_data(question)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-        return result.strip()
-
-    def match(self, gold: str, pred: str) -> float:
-        # simple match
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
-        is_incorrect = not is_correct
-        is_not_attempted = 0
-        return {
-            'is_correct': is_correct,
-            'is_incorrect': is_incorrect,
-            'is_not_attempted': is_not_attempted,
-        }
+        self._use_llm_judge = True  # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.

-    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
-        raw_input = kwargs.get('raw_input', None)
-        question = raw_input['problem']
-        # get grading response
-        prompt = GRADER_TEMPLATE.format(question=question, target=gold, predicted_answer=pred)
-        grading_response = judge(prompt)
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        question = record['problem']
+        answer = record['answer']
+        metadata = record.get('metadata')
+
+        return Sample(input=question, target=answer, metadata=ast.literal_eval(metadata))
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Request judge and obtain score
+        prompt = GRADER_TEMPLATE.format(question=question, target=reference, predicted_answer=filtered_prediction)
+        judge_response = self.llm_judge.judge(prompt)
         # parse grading response
-        match = re.search(r'(A|B|C)', grading_response)
+        match = re.search(r'(A|B|C)', judge_response)
         res = match.group(0) if match else 'C'
-        return {
+
+        # Set score based on the match result
+        score.value = {
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
-            'judge_response': grading_response,
         }
-
-    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-
-        Args:
-            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
-        """
-        # zip dict answers
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-        return super().compute_metric(res_dict, **kwargs)
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'is_correct'
+        return score
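For SimpleQA the judge reply is reduced to a single letter grade, and that letter drives the three metrics. Below is a stripped-down sketch of the parsing step, not part of the diff; the judge client and the `Score` object are replaced with plain Python.

```python
import re


def parse_grade(judge_response: str) -> dict:
    """Map the grader's A/B/C verdict onto the SimpleQA metrics."""
    match = re.search(r'(A|B|C)', judge_response)
    res = match.group(0) if match else 'C'  # no letter found -> counts as not attempted
    return {
        'is_correct': 1 if res == 'A' else 0,
        'is_incorrect': 1 if res == 'B' else 0,
        'is_not_attempted': 1 if res == 'C' else 0,
    }


print(parse_grade('A'))            # graded correct
print(parse_grade('Grade: B'))     # graded incorrect
print(parse_grade('cannot tell'))  # falls back to not attempted
# Note: the search is unanchored, so a reply like 'ANSWER: B' would match the 'A'
# in 'ANSWER' first -- the grader template asks for a bare letter to avoid this.
```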