evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/mmlu/mmlu_adapter.py

@@ -1,77 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import csv
- import os

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
-
- # flake8: noqa
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  logger = get_logger()

- SUBSET_LIST = [
-     'high_school_european_history',
-     'business_ethics',
-     'clinical_knowledge',
-     'medical_genetics',
-     'high_school_us_history',
-     'high_school_physics',
-     'high_school_world_history',
-     'virology',
-     'high_school_microeconomics',
-     'econometrics',
-     'college_computer_science',
-     'high_school_biology',
-     'abstract_algebra',
-     'professional_accounting',
-     'philosophy',
-     'professional_medicine',
-     'nutrition',
-     'global_facts',
-     'machine_learning',
-     'security_studies',
-     'public_relations',
-     'professional_psychology',
-     'prehistory',
-     'anatomy',
-     'human_sexuality',
-     'college_medicine',
-     'high_school_government_and_politics',
-     'college_chemistry',
-     'logical_fallacies',
-     'high_school_geography',
-     'elementary_mathematics',
-     'human_aging',
-     'college_mathematics',
-     'high_school_psychology',
-     'formal_logic',
-     'high_school_statistics',
-     'international_law',
-     'high_school_mathematics',
-     'high_school_computer_science',
-     'conceptual_physics',
-     'miscellaneous',
-     'high_school_chemistry',
-     'marketing',
-     'professional_law',
-     'management',
-     'college_physics',
-     'jurisprudence',
-     'world_religions',
-     'sociology',
-     'us_foreign_policy',
-     'high_school_macroeconomics',
-     'computer_security',
-     'moral_scenarios',
-     'moral_disputes',
-     'electrical_engineering',
-     'astronomy',
-     'college_biology',
- ]
-
  SUBJECT_MAPPING = {
      'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
      'anatomy': ['Anatomy', 'health', 'Other'],
@@ -133,148 +70,38 @@ SUBJECT_MAPPING = {
  }


- @Benchmark.register(
-     name='mmlu',
-     pretty_name='MMLU',
-     tags=['Knowledge', 'MCQ'],
-     description=
-     "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.", # noqa: E501
-     dataset_id='modelscope/mmlu',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split='train',
-     eval_split='test',
-     prompt_template=
-     """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""", # noqa: E501
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mmlu',
+         pretty_name='MMLU',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+         description=
+         "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.", # noqa: E501
+         dataset_id='cais/mmlu',
+         metric_list=['acc'],
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         default_subset='all',
+         few_shot_num=5,
+         train_split='dev',
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+     )
  )
- class MMLUAdapter(DataAdapter):
+ class MMLUAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):

-         few_shot_num = kwargs.get('few_shot_num', 5)
-         if few_shot_num > 5:
-             logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
-             kwargs['few_shot_num'] = 5
-
          super().__init__(**kwargs)

+         self.reformat_subset = True
          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = {}
-         for subset_name in subset_list:
-             data_dict[subset_name] = {}
-
-             for split_name in [self.train_split, self.eval_split]:
-                 if split_name == 'train':
-                     split_name_suffix = 'dev'
-                 elif split_name == 'test':
-                     split_name_suffix = 'test'
-                 elif split_name == 'validation':
-                     split_name_suffix = 'val'
-                 else:
-                     raise ValueError(f'Invalid split name: {split_name}')
-
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
-
-                 if os.path.exists(file_path):
-                     with open(file_path, encoding='utf-8') as f:
-                         rows = []
-                         reader = csv.reader(f)
-                         for row in reader:
-                             if len(row) != 6:
-                                 logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                                 continue
-                             rows.append({
-                                 'input': row[0],
-                                 'A': row[1],
-                                 'B': row[2],
-                                 'C': row[3],
-                                 'D': row[4],
-                                 'target': row[5],
-                             })
-
-                         data_dict[subset_name].update({split_name: rows})
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for MMLU benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the MMLU:
-
-             {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.',
-             'A': 'Service quality.',
-             'B': 'Service action.',
-             'C': 'Service recovery.',
-             'D': 'Service satisfaction.',
-             'target': 'A'}
-
-         Returns:
-             {'data': [full_prompt], 'multi_choices': self.choices}
-
-         """
-         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         context: str = '\n'.join(few_shot_prompts) + '\n'
-         context += self._generate_prompt(input_d=input_d, include_answer=False)
-
-         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
-
-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('target', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or 'custom'
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option(result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-         example: str = input_d['input']
-         for j in range(len(self.choices)):
-             example += f'\n{self.choices[j]}) {input_choices[j]}'
-
-         if include_answer:
-             example += f"\nAnswer: {input_d['target']}\n\n"
-         else:
-             example += '\nAnswer: \n\n'
-
-         return example

-     @classmethod
-     def _format_subject(cls, subject):
-         l = subject.split('_')
-         s = ''
-         for entry in l:
-             s += ' ' + entry
-         return s
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             # converts 0 -> A, 1 -> B, etc.
+             target=('ABCD'[record['answer']]),
+             subset_key=record['subject'],
+             metadata={'subject': record['subject']},
+         )
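
The pattern above is the general shape of a 1.0 benchmark adapter: registration metadata lives in BenchmarkMeta, and the per-benchmark code shrinks to little more than record_to_sample because the shared MultiChoiceAdapter now owns prompting, answer extraction, and matching (the methods removed above). As a minimal, hypothetical sketch using the same API (register_benchmark, BenchmarkMeta, MultiChoiceAdapter, Sample); the benchmark name, dataset id, and record fields below are placeholders, not a real dataset:

# Hypothetical sketch of the 1.0 adapter pattern shown above.
# Benchmark name, dataset_id, and record fields are placeholders.
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',                        # placeholder benchmark name
        pretty_name='MyMCQ',
        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description='A custom multiple-choice benchmark.',
        dataset_id='my-org/my-mcq-dataset',   # placeholder dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw dataset record to the framework's Sample object,
        # converting an integer label to a letter as the MMLU adapter does.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target='ABCD'[record['answer']],
        )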

evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py

@@ -1,10 +1,30 @@
- from collections import defaultdict
  from typing import Any, Dict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # Based on the prompt provided here:
+ # https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/mmlu_pro
+ SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE = """
+ The following are multiple choice questions (with answers) about {subject}. Think step by step and then finish your answer with 'ANSWER: $LETTER' (without quotes) where LETTER is the correct letter choice.
+
+ {examples}
+ """.lstrip() # noqa: E501
+
+ # Based on MultipleChoiceTemplate.SINGLE_ANSWER provided in the multiple choice solver:
+ # https://github.com/UKGovernmentBEIS/inspect_ai/blob/main/src/inspect_ai/solver/_multiple_choice.py
+ USER_PROMPT_TEMPLATE = """Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.
+
+ Question:
+ {question}
+ Options:
+ {choices}
+ """.lstrip() # noqa: E501

  SUBSET_LIST = [
      'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -12,102 +32,63 @@ SUBSET_LIST = [
  ]


- @Benchmark.register(
-     name='mmlu_pro',
-     pretty_name='MMLU-Pro',
-     tags=['MCQ', 'Knowledge'],
-     description=
-     'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
-     dataset_id='modelscope/MMLU-Pro',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=5,
-     train_split='validation',
-     eval_split='test',
-     prompt_template=
-     'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mmlu_pro',
+         pretty_name='MMLU-Pro',
+         tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+         description=
+         'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
+         dataset_id='modelscope/MMLU-Pro',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         few_shot_num=5,
+         train_split='validation',
+         eval_split='test',
+         prompt_template=USER_PROMPT_TEMPLATE,
+         few_shot_prompt_template=SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE + USER_PROMPT_TEMPLATE,
+     )
  )
- class MMLUProAdapter(DataAdapter):
+ class MMLUProAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
-     def load(self, **kwargs):
-         # default load all data
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='category')
-
-     def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-         if self.few_shot_num > 0:
-             prefix = self.format_fewshot_examples(few_shot_list)
-         else:
-             prefix = ''
-         query = prefix + 'Q: ' + input_d['question'] + '\n' + \
-             self.__form_options(input_d['options']) + '\n'
-
-         full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-         return self.gen_prompt_data(full_prompt)
-
-     def format_fewshot_examples(self, few_shot_list):
-         # load few-shot prompts for each category
-         prompts = ''
-         for index, d in enumerate(few_shot_list):
-             prompts += 'Q: ' + d['question'] + '\n' + \
-                 self.__form_options(d['options']) + '\n' + \
-                 d['cot_content'] + '\n\n'
-         return prompts
-
-     def __form_options(self, options: list):
-         option_str = 'Options are:\n'
-         for opt, choice in zip(options, self.choices):
-             option_str += f'({choice}): {opt}' + '\n'
-         return option_str
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-
-         Args:
-             input_d: input raw data. Depending on the dataset.
-
-         Returns:
-             The parsed input. e.g. gold answer ... Depending on the dataset.
-         """
-         return input_d['answer']
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option(result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-
-         Args:
-             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'A', extracted from get_gold_answer method.
-             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'B', extracted from parse_pred_result method.
-
-         Returns:
-             The match result. Usually a score (float) for chat/multiple-choice-questions.
-         """
-         return exact_match(gold=gold, pred=pred)
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['options'],
+             target=record['answer'],
+             subset_key=record['category'].lower(),
+             metadata={
+                 'cot_content': record['cot_content'],
+                 'subject': record['category'].lower(),
+                 'question_id': record['question_id'],
+             },
+         )
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         q_str = f"""Question:\n{str(sample.input)}"""
+         options = sample.choices if sample.choices is not None else []
+         opt_str_list = []
+         for i, opt in enumerate(options):
+             opt_str_list.append(f"""{chr(65 + i)} {opt}""")
+         opt_str = '\n'.join(opt_str_list)
+         opt_str = f"""Options:\n{opt_str}"""
+         ans_str = sample.metadata['cot_content'] if sample.metadata is not None else ''
+         ans_str = ans_str.replace('The answer is', 'ANSWER:')
+         ans_opt = ans_str.split('ANSWER:')[-1].split('.')[0].strip().strip('(').strip(')')
+         ans_str = ans_str.replace(f'ANSWER: ({ans_opt})', f'ANSWER: {ans_opt}')
+         final_str = '\n'.join([q_str, opt_str, ans_str])
+
+         return final_str
+
+     def format_fewshot_template(self, fewshot, sample):
+         fewshot_str = SYSTEM_W_EXAMPLES_PROMPT_TEMPLATE.format(
+             subject=sample.metadata['subject'],
+             examples=fewshot,
+         )
+         prompt_str = self.format_prompt_template(sample)
+         return fewshot_str + '\n' + prompt_str
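
One detail worth noting in the new MMLUProAdapter is how sample_to_fewshot normalizes the dataset's cot_content field so that few-shot answers match the 'ANSWER: $LETTER' format requested by the prompt templates. Below is a standalone sketch of that string transformation, using an invented example value rather than real dataset content:

# Illustrative only: the cot_content value is invented; the transformation
# mirrors the string handling in sample_to_fewshot above.
cot_content = "A: Let's think step by step. Adding 2 and 2 gives 4. The answer is (B)."

# 1) Rewrite the dataset's closing phrase into the prompt's answer marker.
ans_str = cot_content.replace('The answer is', 'ANSWER:')
# 2) Pull out the bare letter, dropping surrounding parentheses.
ans_opt = ans_str.split('ANSWER:')[-1].split('.')[0].strip().strip('(').strip(')')
# 3) Drop the parentheses in the final answer line as well.
ans_str = ans_str.replace(f'ANSWER: ({ans_opt})', f'ANSWER: {ans_opt}')

print(ans_opt)  # B
print(ans_str)  # A: Let's think step by step. Adding 2 and 2 gives 4. ANSWER: B.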

evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py

@@ -1,29 +1,14 @@
- from collections import defaultdict
  from typing import Any, Dict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  logger = get_logger()

- SUBSET_LIST = [
-     'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-     'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-     'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-     'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-     'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
-     'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
-     'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
-     'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
-     'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
-     'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-     'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
-     'world_religions'
- ]
-
  SUBJECT_MAPPING = {
      'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
      'anatomy': ['Anatomy', 'health', 'Other'],
@@ -84,25 +69,31 @@ SUBJECT_MAPPING = {
      'world_religions': ['World Religions', 'philosophy', 'Humanities'],
  }

-
- @Benchmark.register(
-     name='mmlu_redux',
-     pretty_name='MMLU-Redux',
-     tags=['MCQ', 'Knowledge'],
-     description=
-     'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
-     dataset_id='AI-ModelScope/mmlu-redux-2.0',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=
-     'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+ SUBSET_LIST = list(SUBJECT_MAPPING.keys())
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mmlu_redux',
+         pretty_name='MMLU-Redux',
+         tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+         description=
+         'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options. ' # noqa: E501
+         'The bad answers are corrected.', # noqa: E501
+         dataset_id='AI-ModelScope/mmlu-redux-2.0',
+         subset_list=SUBSET_LIST,
+         metric_list=[{
+             'acc': {
+                 'allow_inclusion': True
+             }
+         }],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+     )
  )
- class MMLUReduxAdapter(DataAdapter):
+ class MMLUReduxAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
@@ -111,75 +102,38 @@ class MMLUReduxAdapter(DataAdapter):
          self.few_shot_num = 0
          logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')

-         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-
-     def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-         if self.few_shot_num > 0:
-             prefix = self.format_fewshot_examples(few_shot_list)
-         else:
-             prefix = ''
-         query = prefix + 'Q: ' + input_d['question'] + '\n' + \
-             self.__form_options(input_d['choices']) + '\n'
-
-         full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-         return self.gen_prompt_data(full_prompt)
-
-     def format_fewshot_examples(self, few_shot_list):
-         # load few-shot prompts for each category
-         prompts = ''
-         for index, d in enumerate(few_shot_list):
-             prompts += 'Q: ' + d['question'] + '\n' + \
-                 self.__form_options(d['choices']) + '\n'
-         return prompts
-
-     def __form_options(self, options: list):
-         option_str = 'Options are:\n'
-         for opt, choice in zip(options, self.choices):
-             option_str += f'({choice}): {opt}' + '\n'
-         return option_str
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-
-         Args:
-             input_d: input raw data. Depending on the dataset.
-
-         Returns:
-             The parsed input. e.g. gold answer ... Depending on the dataset.
-         """
-         answer_index = int(input_d['answer'])
-         return self.choices[answer_index]
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option(result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-
-         Args:
-             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'A', extracted from get_gold_answer method.
-             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'B', extracted from parse_pred_result method.
-
-         Returns:
-             The match result. Usually a score (float) for chat/multiple-choice-questions.
-         """
-         return exact_match(gold=gold, pred=pred)
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         error_type = record['error_type']
+         choices = record['choices']
+         target_index_list = [int(record['answer'])]
+         correct_answer = record['correct_answer']
+         if error_type == 'no_correct_answer' and correct_answer:
+             choices[target_index_list[0]] = correct_answer
+         elif error_type == 'wrong_groundtruth' and correct_answer:
+             try:
+                 target_index_list = [int(correct_answer)]
+             except ValueError:
+                 choice_index = ord(correct_answer) - ord('A')
+                 target_index_list = [choice_index]
+         elif error_type == 'multiple_correct_answers' and correct_answer:
+             correct_answer = correct_answer.strip('()')
+             try:
+                 correct_answer = correct_answer.replace(' and ', ',').replace(' or ', ',')
+                 target_index_list = list(map(int, correct_answer.split(',')))
+             except ValueError:
+                 try:
+                     target_index_list = [ord(c) - ord('A') for c in correct_answer.split(',')]
+                 except TypeError:
+                     # find the index of the correct answer in choices
+                     target_index_list = [choices.index(c) for c in correct_answer.split(',') if c in choices]
+
+         return Sample(
+             input=record['question'],
+             choices=choices,
+             target=['ABCD'[i] for i in target_index_list] if target_index_list else ['A', 'B', 'C', 'D'],
+             metadata={
+                 'error_type': error_type,
+                 'correct_answer': correct_answer,
+                 'potential_reason': record.get('potential_reason', ''),
+             },
+         )
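
The branching in record_to_sample above folds the Redux error annotations back into the sample target. For illustration only, here are two hypothetical records (all field values invented) annotated with the target that this logic would derive from them:

# Illustrative only: example records in the shape the adapter expects;
# the comments trace what the correction logic above produces.
wrong_groundtruth = {
    'question': 'Which city is the capital of France?',
    'choices': ['Paris', 'London', 'Rome', 'Berlin'],
    'answer': 1,                    # original (incorrect) label: 'B'
    'error_type': 'wrong_groundtruth',
    'correct_answer': 'A',          # int('A') fails, ord-based branch -> target ['A']
}

multiple_correct = {
    'question': 'Which of the following are prime numbers?',
    'choices': ['2', '4', '5', '6'],
    'answer': 0,
    'error_type': 'multiple_correct_answers',
    'correct_answer': '(A and C)',  # parsed to indices [0, 2] -> target ['A', 'C']
}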