evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py

@@ -1,16 +1,19 @@
+ # flake8: noqa: E501
  import glob
  import os
  from collections import defaultdict
- from typing import Any, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import Metric, mean, metric_registry
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, DictDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.report import Report, ReportKey
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

  GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
@@ -19,59 +22,77 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
  ) # noqa: E501


- @Benchmark.register(
- name='general_arena',
- pretty_name='GeneralArena',
- tags=['Custom', 'Arena'],
- description=
- 'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
- 'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
- 'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
- 'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/arena.html).',
- dataset_id='general_arena',
- metric_list=['winrate'],
- few_shot_num=0,
- train_split=None,
- eval_split='test',
- system_prompt=GRADER_SYSTEM_PROMPT,
- prompt_template=GRADER_TEMPLATE,
- extra_params={
- 'models': [{
- 'name': 'qwen-plus',
- 'report_path': 'outputs/20250627_172550/reports/qwen-plus'
- }, {
- 'name': 'qwen2.5-7b',
- 'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
- }],
- 'baseline':
- 'qwen2.5-7b'
- })
- class GeneralArenaAdapter(DataAdapter):
+ @register_benchmark(
+ BenchmarkMeta(
+ name='general_arena',
+ pretty_name='GeneralArena',
+ tags=[Tags.CUSTOM, Tags.ARENA],
+ description=
+ 'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
+ 'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
+ 'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
+ 'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/arena.html).',
+ dataset_id='general_arena',
+ metric_list=['winrate'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='test',
+ system_prompt=GRADER_SYSTEM_PROMPT,
+ prompt_template=GRADER_TEMPLATE,
+ extra_params={
+ 'models': [{
+ 'name': 'qwen-plus',
+ 'report_path': 'outputs/20250627_172550/reports/qwen-plus'
+ }, {
+ 'name': 'qwen2.5-7b',
+ 'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
+ }],
+ 'baseline':
+ 'qwen2.5-7b'
+ }
+ )
+ )
+ class GeneralArenaAdapter(DefaultDataAdapter):

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)

- # register metrics
- metric_registry.register(Metric(name='winrate', object=mean))
+ self._use_llm_judge = True

- # whether to use LLM as a judge
- self.llm_as_a_judge = True
+ self.models = self.extra_params.get('models', [])
+ self.baseline = self.extra_params.get('baseline', None)

- extra_params = kwargs.get('extra_params', {})
- self.models = extra_params.get('models', [])
- self.baseline = extra_params.get('baseline', None)
-
- def load(self, **kwargs):
+ def load(self):
+ """Load dataset by processing model reports."""
  self._check_names()
  self._check_reports()
  self._check_datasets()
  logger.info(f'Overall datasets: {self.overall_datasets}')
  dataset_model_dict = self._load_common_datasets()
- data_dict = self._build_pair_wise_data(dataset_model_dict)
- return data_dict
-
- def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
- return self.gen_prompt_data(input_d['question'])
+ datasets = self._build_pair_wise_data(dataset_model_dict)
+
+ # Convert to DatasetDict format
+ dataset_dict = {}
+ for subset_name, samples in datasets.items():
+ dataset = DictDataLoader(
+ dict_list=samples, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+ ).load()
+ dataset_dict[subset_name] = dataset
+
+ test_dataset = DatasetDict(dataset_dict)
+ return test_dataset, None
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """Convert a data record to a Sample object."""
+ return Sample(
+ input=[ChatMessageUser(content=record['question'])],
+ target=record['answer_2'], # baseline answer
+ metadata={
+ 'answer_1': record['answer_1'],
+ 'model_1': record['model_1'],
+ 'model_2': record['model_2'],
+ }
+ )

  def _check_names(self):
  """Check the names of the models and baseline."""
@@ -119,7 +140,8 @@ class GeneralArenaAdapter(DataAdapter):

  def _load_common_datasets(self):
  """Load common datasets from the local path."""
- from evalscope.utils import OutputsStructure, jsonl_to_list
+ from evalscope.utils import OutputsStructure
+ from evalscope.utils.io_utils import jsonl_to_list

  dataset_dict = defaultdict(dict)
  for dataset_name, subset_name in self.overall_datasets:
@@ -128,7 +150,8 @@ class GeneralArenaAdapter(DataAdapter):
  dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
  if not os.path.exists(dataset_file_path):
  raise ValueError(
- f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.')
+ f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.'
+ )
  dataset = jsonl_to_list(dataset_file_path)
  # sort by index
  dataset.sort(key=lambda x: x.get('index'))
@@ -138,9 +161,10 @@ class GeneralArenaAdapter(DataAdapter):

  def _build_pair_wise_data(self, dataset_dict):
  """Build pairwise data for the models."""
+ from evalscope.api.evaluator import ReviewResult
  from .utils import process_review_item

- pairwise_data = defaultdict(dict)
+ pairwise_data = defaultdict(list)
  for (dataset_name, subset_name), model_data in dataset_dict.items():
  if len(model_data) < 2:
  logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')
@@ -152,8 +176,13 @@ class GeneralArenaAdapter(DataAdapter):
  continue
  pairs = []
  for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
+ # Convert to ReviewResult objects like in get_model_prediction
+ model_review = ReviewResult.model_validate(model_item)
+ baseline_review = ReviewResult.model_validate(baseline_item)
+
  for model_choice, baseline_choice in zip(
- process_review_item(model_item), process_review_item(baseline_item)):
+ process_review_item(model_review), process_review_item(baseline_review)
+ ):
  pairs.append({
  'question': model_choice['Question'],
  'answer_1': model_choice['Generated'],
@@ -161,23 +190,26 @@ class GeneralArenaAdapter(DataAdapter):
  'model_1': name,
  'model_2': self.baseline
  })
- pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'][self.eval_split] = pairs
+ pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'] = pairs

  return pairwise_data

- def llm_match(self, gold, pred, judge=None, **kwargs):
+ def llm_match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+ ) -> Score:
+ """Use LLM as a judge to evaluate the predicted answer against the baseline."""
  from .utils import get_judge_score, post_process_result

- try:
- raw_input = kwargs.get('raw_input', None)
- question = raw_input['question']
- answer_1 = raw_input['answer_1']
- answer_2 = raw_input['answer_2']
- model_1 = raw_input['model_1']
- model_2 = raw_input['model_2']
- except KeyError as e:
- logger.error(f'Missing key in raw input: {e}. Raw input: {raw_input}')
- raise
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ question = task_state.input_text
+ answer_1 = task_state.metadata['answer_1']
+ answer_2 = reference # baseline answer
+ model_1 = task_state.metadata['model_1']
+ model_2 = task_state.metadata['model_2']

  system_template = self.system_prompt
  prompt_template = self.prompt_template
@@ -185,9 +217,11 @@ class GeneralArenaAdapter(DataAdapter):
  prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
  # reverse the order
  prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
+
  # get grading response
- game1_response = judge(prompt1, system_prompt=system_template)
- game2_response = judge(prompt2, system_prompt=system_template)
+ game1_response = self.llm_judge.judge(prompt1, system_prompt=system_template)
+ game2_response = self.llm_judge.judge(prompt2, system_prompt=system_template)
+
  # parse grading response
  # game1
  res1 = post_process_result(game1_response)
@@ -195,9 +229,9 @@ class GeneralArenaAdapter(DataAdapter):
  # game2
  res2 = post_process_result(game2_response)
  score2 = get_judge_score(res2, reverse=True)
- return {
- 'score':
- mean([score1, score2]),
+
+ battle_result = {
+ 'score': (score1 + score2) / 2,
  'games': [
  {
  'model_a': model_1,
@@ -214,31 +248,38 @@ class GeneralArenaAdapter(DataAdapter):
  ]
  }

- def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
- """
- compute score of the model
- """
+ score.value = {'score': battle_result['score']}
+ score.explanation = f'LLM judge battles: Game1: {game1_response[:100]}... Game2: {game2_response[:100]}...'
+ score.metadata = {
+ 'source': 'llm_judge',
+ 'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+ 'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown',
+ 'battle_result': battle_result
+ }
+ score.main_score_name = 'score'
+
+ return score
+
+ def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+ """Aggregate scores to compute winrate."""
  import numpy as np
  import pandas as pd

  from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column

- if isinstance(review_res_list[0], list):
- review_res_list = [item for sublist in review_res_list for item in sublist]
-
- battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+ battles = pd.concat([get_battles_from_row(res.score.metadata['battle_result']) for res in sample_scores])

  bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)

  bootstrap_model_coef = get_bootstrap_result(
- battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline)
+ battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline
+ )

  stats = pd.DataFrame()
  stats['results'] = None
  stats['results'] = stats['results'].astype('object')

  for i, model in enumerate(bt_model_coef.index):
- # assert model in bootstrap_elo_lu.columns
  stats.at[i, 'model'] = model
  stats.at[i, 'score'] = bt_model_coef[model]
  stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)
@@ -249,20 +290,25 @@ class GeneralArenaAdapter(DataAdapter):
  metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
  metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()

- metrics = []
+ agg_scores = []
  for metric_name, models in metrics_dict.items():
- for model_name, score in models.items():
+ for model_name, score_val in models.items():
  if model_name == self.baseline:
  continue
- metrics.append({'metric_name': metric_name, 'score': score, 'num': len(review_res_list)})
- return metrics
+ agg_scores.append(AggScore(score=score_val, metric_name=metric_name, num=len(sample_scores)))
+
+ return agg_scores

- def post_process_report(self, report: 'Report', **kwargs):
+ def extract_answer(self, prediction, task_state):
+ # NOTE: This is a hacky way to extract the answer from the prediction
+ return task_state.metadata['answer_1']
+
+ def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
  """Post-process the report to convert it to a DataFrame with winrate leaderboards."""
  import pandas as pd
  import tabulate

- report_path = kwargs.get('report_path')
+ report_path = output_dir
  leaderboard_file = os.path.join(report_path, 'leaderboard.txt')

  # Ensure report directory exists
@@ -288,7 +334,8 @@ class GeneralArenaAdapter(DataAdapter):
  """Format DataFrame as leaderboard with CI."""
  # Pivot to get winrate, winrate_lower, winrate_upper as columns
  pivot_df = data_df.pivot_table(
- index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first')
+ index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first'
+ )

  # Add baseline model with 50% winrate
  baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}
@@ -392,20 +439,11 @@ class GeneralArenaAdapter(DataAdapter):
  subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
  & (parsed_df['subset_name'] == subset_name)]
  leaderboard_outputs.append(
- format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ==='))
+ format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ===')
+ )

  # Write all leaderboard outputs to file
  with open(leaderboard_file, 'w', encoding='utf-8') as f:
  f.write('\n'.join(leaderboard_outputs))

  logger.info(f'Leaderboard results saved to: {leaderboard_file}')
-
- def get_gold_answer(self, input_d):
- return f"model_1: {input_d['model_1']}\n---\n" + input_d['answer_1']
-
- def llm_parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
- return f"model_2: {raw_input_d['model_2']}\n---\n" + raw_input_d['answer_2']
-
- def match(self, gold, pred):
- logger.warning(f'Please use LLMJudge to match the result for {self.name}')
- return
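The registration change above is the same migration applied across the benchmark adapters in the file list: the old `@Benchmark.register(...)` decorator on a `DataAdapter` subclass becomes `@register_benchmark(BenchmarkMeta(...))` on a `DefaultDataAdapter` (or `MultiChoiceAdapter`) subclass that converts raw records through `record_to_sample`. As a minimal sketch of the new-style pattern, using only names that appear in this diff (the benchmark name and fields below are hypothetical, and exact 1.0.0 signatures may differ):

    # Hypothetical example adapter; 'my_custom_bench' and its fields are illustrative only.
    from typing import Any, Dict

    from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
    from evalscope.api.dataset import Sample
    from evalscope.api.registry import register_benchmark
    from evalscope.constants import Tags


    @register_benchmark(
        BenchmarkMeta(
            name='my_custom_bench',      # hypothetical dataset name
            pretty_name='MyCustomBench',
            tags=[Tags.CUSTOM],
            dataset_id='my_custom_bench',
            metric_list=['acc'],
            few_shot_num=0,
            eval_split='test',
        )
    )
    class MyCustomAdapter(DefaultDataAdapter):

        def record_to_sample(self, record: Dict[str, Any]) -> Sample:
            # Map one raw record to the unified Sample structure used by the new API.
            return Sample(input=record['question'], target=record['answer'])

The practical effect visible in the diff is that benchmark metadata that used to live in decorator keyword arguments is now grouped in a `BenchmarkMeta` object, while per-sample parsing moves from `gen_prompt`/`get_gold_answer` style hooks into `record_to_sample`.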
evalscope/benchmarks/general_arena/utils.py

@@ -7,44 +7,39 @@ from collections import defaultdict
  from sklearn.linear_model import LogisticRegression
  from tqdm import tqdm

+ from evalscope.api.evaluator import ReviewResult
  from evalscope.utils.logger import get_logger

  logger = get_logger()


- def process_review_item(review_item: dict) -> dict:
+ def process_review_item(review_result: ReviewResult) -> list:
  """
- Process a single review item to extract relevant information.
+ Process a ReviewResult object to extract relevant information.

  Args:
- review_item (dict): The review item to process.
+ review_result: ReviewResult object or dict (for backward compatibility)

  Returns:
- dict: Processed review item with necessary information.
+ list: List of processed review items with necessary information.
  """
- res = []
- raw_input = review_item['raw_input']
- sample_index = review_item['index']
- question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
- # Find the first non-empty question key in raw_input
- question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
- for choice_index, choice in enumerate(review_item['choices']):
- raw_pred_answer = choice['message']['content']
- parsed_gold_answer = choice['review']['gold']
- parsed_pred_answer = choice['review']['pred']
- score = choice['review']['result']
- raw_d = {
- 'Index': f'{sample_index}_{choice_index}',
- 'Input': raw_input,
- 'Question': question if question else '*No Question*',
- 'Generated': raw_pred_answer,
- 'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer,
- 'Score': score,
- }
- res.append(raw_d)

- return res
+ # New format using ReviewResult
+ sample_score = review_result.sample_score
+ prediction = sample_score.score.prediction
+ target = review_result.target
+ extracted_prediction = sample_score.score.extracted_prediction
+
+ raw_d = {
+ 'Index': str(review_result.index),
+ 'Input': review_result.input,
+ 'Question': review_result.input, # Use input as question
+ 'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+ 'Gold': target,
+ 'Pred': extracted_prediction,
+ 'Score': sample_score.score.model_dump(exclude_none=True),
+ }
+ return [raw_d]


  def post_process_result(completion):
@@ -179,7 +174,8 @@ def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gp
  return elo_scores.sort_values(ascending=False)

  lr = LogisticRegression(
- fit_intercept=False, penalty=None, tol=1e-8) # May need to set a small value when not use GPT4 as judge model
+ fit_intercept=False, penalty=None, tol=1e-8
+ ) # May need to set a small value when not use GPT4 as judge model
  lr.fit(X, Y)

  elo_scores = scale * lr.coef_[0] + init_rating
evalscope/benchmarks/general_mcq/general_mcq_adapter.py

@@ -2,118 +2,57 @@
  import os
  from collections import defaultdict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  # flake8: noqa

  logger = get_logger()


- @Benchmark.register(
- name='general_mcq',
- pretty_name='General-MCQ',
- description='A general multiple-choice question answering dataset for custom evaluation. '
- 'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
- tags=['MCQ', 'Custom'],
- dataset_id='general_mcq',
- model_adapter=OutputType.GENERATION,
- output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
- subset_list=['default'],
- metric_list=['AverageAccuracy'],
- few_shot_num=0,
- train_split='dev',
- eval_split='val',
- prompt_template='请回答问题,并选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
- query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
- class GeneralMCQAdapter(DataAdapter):
+ @register_benchmark(
+ BenchmarkMeta(
+ name='general_mcq',
+ pretty_name='General-MCQ',
+ description='A general multiple-choice question answering dataset for custom evaluation. '
+ 'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
+ tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
+ dataset_id='general_mcq',
+ subset_list=['default'],
+ metric_list=['acc'],
+ few_shot_num=0,
+ train_split='dev',
+ eval_split='val',
+ prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE,
+ )
+ )
+ class GeneralMCQAdapter(MultiChoiceAdapter):

  def __init__(self, **kwargs):
  super().__init__(**kwargs)

  self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

- def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict = defaultdict(dict)
- for subset_name in subset_list:
- for split_name in [self.train_split, self.eval_split]:
- # Check for files with different extensions
- for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
- if os.path.exists(dataset_name_or_path):
- file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
- else:
- file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
-
- if os.path.exists(file_path):
- data_dict[subset_name][split_name] = loader(file_path)
- break # Stop checking other extensions once a file is found
-
- return dict(data_dict)
-
- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
- """
- Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
- Args:
- input_d (dict): The raw input. A single data format of the C-Eval:
-
- {'id': 0,
- 'question': '下列关于税法基本原则的表述中,不正确的是____。',
- 'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
- 'B': '税收公平原则源于法律上的平等性原则',
- 'C': '税收效率原则包含经济效率和行政效率两个方面',
- 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
- 'answer': 'D'}
-
- Returns:
- {'data': ['prompt ...']}
- """
-
- few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
- if len(few_shot_prompts) > 0:
- context: str = '\n'.join(few_shot_prompts) + '\n'
- else:
- context = ''
- context = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
- full_prompt = self.prompt_template.format(query=context)
-
- return self.gen_prompt_data(full_prompt)
-
- def get_gold_answer(self, input_d: dict) -> str:
- # Get the gold choice
- return input_d.get('answer', '')
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
- """
- Parse the model output to get the answer. Could be the best choice index.
-
- Args:
- result: Predicted answer from the model. Usually a string for chat.
- raw_input_d (dict): The raw input. Depending on the dataset.
- eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
-
- Returns:
- The parsed answer. Depending on the dataset. Usually a string for chat.
- """
- if self.model_adapter == OutputType.MULTIPLE_CHOICE:
- return result
- else:
- return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
- def match(self, gold: str, pred: str) -> float:
- return exact_match(gold=gold, pred=pred)
-
- def _format_example(self, input_d: dict, include_answer=True):
- choices_str = '\n'.join([f'{choice}. {input_d[choice]}' for choice in self.choices if choice in input_d])
-
- if include_answer:
- return self.query_template.format(
- question=input_d['question'], choices=choices_str, answer=input_d['answer'])
- else:
- return self.query_template.format(question=input_d['question'], choices=choices_str, answer='').rstrip()
+ def load_from_disk(self, **kwargs):
+ return super().load_from_disk(use_local_loader=True)
+
+ def record_to_sample(self, record) -> Sample:
+ # Extract choices from the record (A, B, C, D, etc.)
+ choices = []
+ for choice_key in self.choices:
+ if choice_key in record:
+ choices.append(record[choice_key])
+ else:
+ break # Stop when we reach a choice key that doesn't exist
+
+ return Sample(
+ input=record['question'],
+ choices=choices,
+ target=record['answer'],
+ metadata={'id': record.get('id', 'unknown')},
+ )
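Stripped of the diff markers, the added lines above amount to roughly the following adapter. This is a reconstruction for readability only: the diff view drops the original indentation, so the layout below is approximate, and the long `description=` argument is elided.

    # Approximate reconstruction of the new general_mcq adapter from the '+' lines above.
    @register_benchmark(
        BenchmarkMeta(
            name='general_mcq',
            pretty_name='General-MCQ',
            # description omitted for brevity; see the '+' lines in the hunk above
            tags=[Tags.MULTIPLE_CHOICE, Tags.CUSTOM],
            dataset_id='general_mcq',
            subset_list=['default'],
            metric_list=['acc'],
            few_shot_num=0,
            train_split='dev',
            eval_split='val',
            prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE,
        )
    )
    class GeneralMCQAdapter(MultiChoiceAdapter):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

        def load_from_disk(self, **kwargs):
            # Delegate loading of the custom jsonl/csv files to the base adapter.
            return super().load_from_disk(use_local_loader=True)

        def record_to_sample(self, record) -> Sample:
            # Collect the available option texts (A, B, C, ...) in order.
            choices = []
            for choice_key in self.choices:
                if choice_key in record:
                    choices.append(record[choice_key])
                else:
                    break  # stop at the first missing option key
            return Sample(
                input=record['question'],
                choices=choices,
                target=record['answer'],
                metadata={'id': record.get('id', 'unknown')},
            )

Compared with the removed 0.17.1 code, prompt construction, answer parsing, and exact-match scoring no longer live in the adapter; only data loading and record-to-`Sample` conversion remain, with the rest handled by `MultiChoiceAdapter`.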