evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py

@@ -1,12 +1,14 @@
  import re
- from typing import Any, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

  GRADER_TEMPLATE = """
@@ -40,7 +42,7 @@ GRADER_TEMPLATE = """
  模型预测6:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
  ```
  这些答复均为【错误】,因为:
- - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:“可能是”,“虽然我不确定,但我认为”),也视为错误。
+ - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:"可能是","虽然我不确定,但我认为"),也视为错误。

  以下是【未尝试】的答复示例:
  ```
@@ -56,16 +58,16 @@ GRADER_TEMPLATE = """
  - 回复中没有与标准答案矛盾的陈述。

  另外注意以下几点:
- - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题“金山铁路黄浦江特大桥的全长是多少米?”,标准答案为“3518.17”:
- - 预测答案“3518”、“3518.1”、“3518.17”均为【正确】。
- - 预测答案“3520”和“3600”均为【错误】。
- - 预测答案“大约3500米”和“超过3000米”被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
+ - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题"金山铁路黄浦江特大桥的全长是多少米?",标准答案为"3518.17":
+ - 预测答案"3518"、"3518.1"、"3518.17"均为【正确】。
+ - 预测答案"3520"和"3600"均为【错误】。
+ - 预测答案"大约3500米"和"超过3000米"被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
  - 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
- - 例如,考虑问题“菱镁矿的主要化学成分是什么?”标准答案为“碳酸镁(MgCO3)”。“碳酸镁”或“MgCO3”均视为【正确】答案。
+ - 例如,考虑问题"菱镁矿的主要化学成分是什么?"标准答案为"碳酸镁(MgCO3)"。"碳酸镁"或"MgCO3"均视为【正确】答案。
  - 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
- - 例如,问题“巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?”标准答案为“意大利撒丁岛”,预测答案“撒丁岛”被视为【正确】。
+ - 例如,问题"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?"标准答案为"意大利撒丁岛",预测答案"撒丁岛"被视为【正确】。
  - 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
- - 例如,如果标准答案是“Robinson”,那么回答鲁滨逊或者鲁滨孙均正确。
+ - 例如,如果标准答案是"Robinson",那么回答鲁滨逊或者鲁滨孙均正确。

  下面是一个新的问题示例。请只回复A、B、C之一,不要道歉或纠正自己的错误,只需要评估该回答。
  ```
@@ -80,88 +82,89 @@ B:【错误】
  C:【未尝试】

  只返回字母"A"、"B"或"C",无须添加其他文本。
- """.strip()  # noqa E501
+ """.strip()

  SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应用科学', '生活、艺术与文化', '社会', '自然与自然科学']


- @Benchmark.register(
-     name='chinese_simpleqa',
-     pretty_name='Chinese-SimpleQA',
-     tags=['Knowledge', 'QA', 'Chinese'],
-     description=
-     "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.",  # noqa: E501
-     subset_list=SUBSET_LIST,
-     dataset_id='AI-ModelScope/Chinese-SimpleQA',
-     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='train')
- class ChineseSimpleQAAdapter(DataAdapter):
+ @register_benchmark(
+     BenchmarkMeta(
+         name='chinese_simpleqa',
+         pretty_name='Chinese-SimpleQA',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.CHINESE],
+         description=
+         "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.",  # noqa: E501
+         subset_list=SUBSET_LIST,
+         dataset_id='AI-ModelScope/Chinese-SimpleQA',
+         metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train',
+         prompt_template='请回答问题:\n\n{question}'
+     )
+ )
+ class ChineseSimpleQAAdapter(DefaultDataAdapter):

      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)

-         # register metrics
-         metric_registry.register(Metric(name='is_correct', object=mean))
-         metric_registry.register(Metric(name='is_incorrect', object=mean))
-         metric_registry.register(Metric(name='is_not_attempted', object=mean))
-
-         # whether to use LLM as a judge
-         self.llm_as_a_judge = True
-
-     def load(self, **kwargs):
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='primary_category', format='{}')
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         question = input_d['question']
-         return self.gen_prompt_data(question)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         return input_d['answer']
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-         return result.strip()
-
-     def match(self, gold: str, pred: str) -> float:
-         # simple match
-         logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
-         is_incorrect = not is_correct
-         is_not_attempted = 0
-         return {
-             'is_correct': is_correct,
-             'is_incorrect': is_incorrect,
-             'is_not_attempted': is_not_attempted,
+         self._use_llm_judge = True  # Use LLM as a judge by default
+         self.reformat_subset = True  # Reformat subset to primary_category
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         question = record['question']
+         answer = record['answer']
+         subset_key = record.get('primary_category', 'default')  # Use primary_category as subset key
+         metadata = {
+             'id': record.get('id', 'unknown'),
+             'primary_category': subset_key,
+             'secondary_category': record.get('secondary_category', '')
          }

-     def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['question']
-         # get grading response
-         prompt = GRADER_TEMPLATE.format(question=question, target=gold, predicted_answer=pred)
+         return Sample(input=question, target=answer, subset_key=subset_key, metadata=metadata)
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Request judge and obtain score
+         prompt = GRADER_TEMPLATE.format(question=question, target=reference, predicted_answer=filtered_prediction)
          system_prompt = '你是一个智能助手,请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。'
-         grading_response = judge(prompt, system_prompt)
+         judge_response = self.llm_judge.judge(prompt, system_prompt=system_prompt)
          # parse grading response
-         match = re.search(r'(A|B|C)', grading_response)
+         match = re.search(r'(A|B|C)', judge_response)
          res = match.group(0) if match else 'C'
-         return {
+
+         # Set score based on the match result
+         score.value = {
              'is_correct': 1 if res == 'A' else 0,
              'is_incorrect': 1 if res == 'B' else 0,
              'is_not_attempted': 1 if res == 'C' else 0,
-             'judge_response': grading_response,
          }
-
-     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
-         """
-         compute weighted mean of the bleu score of all samples
-
-         Args:
-             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
-         """
-         # zip dict answers
-         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-         return super().compute_metric(res_dict, **kwargs)
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'is_correct'
+         return score
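
For context, the hunk above shows the shape of the 1.0 adapter API: registration moves from the @Benchmark.register decorator to @register_benchmark(BenchmarkMeta(...)), and LLM-judge grading now returns a Score object instead of a plain dict. A minimal usage sketch for running the benchmark registered above through evalscope's task entry point follows; the exact TaskConfig fields are illustrative and should be checked against the installed 1.0.1 API:

# Minimal sketch (illustrative values): run the 'chinese_simpleqa' benchmark
# registered above via evalscope's task entry point.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['chinese_simpleqa'],  # name registered via @register_benchmark
    limit=5,  # small smoke-test run
)
run_task(task_cfg=task_cfg)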

evalscope/benchmarks/cmmlu/cmmlu_adapter.py

@@ -1,36 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import csv
- import os
- from collections import defaultdict
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
- from evalscope.utils.io_utils import csv_to_list
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  # flake8: noqa

  logger = get_logger()

- SUBSET_LIST = [
-     'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
-     'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
-     'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
-     'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
-     'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
-     'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
-     'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
-     'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
-     'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
-     'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
-     'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
-     'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
-     'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
- ]
-
  SUBJECT_MAPPING = {
      'agronomy': ['other', 'Other'],
      'anatomy': ['biology', 'STEM'],
@@ -102,112 +82,41 @@ SUBJECT_MAPPING = {
  }


- @Benchmark.register(
-     name='cmmlu',
-     pretty_name='C-MMLU',
-     tags=['Knowledge', 'MCQ', 'Chinese'],
-     description=
-     'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
-     dataset_id='modelscope/cmmlu',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=5,
-     train_split='dev',
-     eval_split='test',
-     prompt_template=
-     '以下是关于{subset_name}的单项选择题,请给出正确答案的选项。你的回答的最后一行应该是这样的格式:“答案:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
+ @register_benchmark(
+     BenchmarkMeta(
+         name='cmmlu',
+         pretty_name='C-MMLU',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+         description=
+         'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
+         dataset_id='evalscope/cmmlu',
+         metric_list=['acc'],
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+     )
  )
- class CMMLUAdapter(DataAdapter):
+ class CMMLUAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

+         self.reformat_subset = True
          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for CMMLU benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the CMMLU:
-
-             {'Question': '下列关于重力的说法正确的是',
-             'A': '在地球周围的物体都要受到重力作用,与其运动状态无关',
-             'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变',
-             'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下',
-             'D': '在地球表面各处的重力方向都是相同的',
-             'Answer': 'A'}
-
-         Returns:
-             {'data': [(context, continuation), ...]}
-
-         """
-         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-         context = '\n'.join(few_shot_prompts) + '\n'
-         context += self._generate_prompt(input_d=input_d, include_answer=False)
-
-         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
-
-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('Answer', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: The evaluation type. 'checkpoint', 'service', 'custom'.
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-         example: str = input_d['Question']
-         for j in range(len(self.choices)):
-             example += '\n{}. {}'.format(self.choices[j], input_choices[j])

-         example += '\nAnswer:'
-         if include_answer:
-             example += ' {}\n\n'.format(input_d['Answer'])
+     def record_to_sample(self, record) -> Sample:

-         return example
+         # choices: ["(A) 农业生产工具","(B) 土地","(C) 劳动力","(D) 资金"]
+         # remove the leading (A), (B), (C), (D)
+         raw_choices = record['choices']
+         choice_list = [choice[3:].strip() for choice in raw_choices]

-     @classmethod
-     def _format_subject(cls, subject):
-         l = subject.split('_')
-         s = ''
-         for entry in l:
-             s += ' ' + entry
-         return s
+         return Sample(
+             input=record['question'],
+             choices=choice_list,
+             target=record['answer'][1],  # answer is like "A"
+             subset_key=record['category'],
+             metadata={'subject': record['category']},
+         )

evalscope/benchmarks/competition_math/competition_math_adapter.py

@@ -1,125 +1,73 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI, Inc. and its affiliates.
- import glob
- import json
- import os
- from collections import defaultdict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

-
- @Benchmark.register(
-     name='competition_math',
-     pretty_name='MATH',
-     tags=['Mathematics'],
-     description=
-     'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
-     dataset_id='modelscope/competition_math',
-     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-     metric_list=['AveragePass@1'],
-     few_shot_num=4,
-     train_split=None,
-     eval_split='test',
-     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ PROMPT_TEMPLATE = """
+ Problem:
+ {question}
+
+ Please reason step by step, and put your final answer within \\boxed{{}}.
+ """.lstrip()
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of how to solve similar problems:
+
+ {fewshot}
+ """.lstrip() + PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='competition_math',
+         pretty_name='MATH',
+         tags=[Tags.MATH, Tags.REASONING],
+         description=
+         'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
+         dataset_id='evalscope/competition_math',
+         subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+         metric_list=[{
+             'acc': {
+                 'numeric': True
+             }
+         }],
+         few_shot_num=4,
+         train_split='train',
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class CompetitionMathAdapter(DataAdapter):
-     """ To be tested for all models. """
+ class CompetitionMathAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
-
-         few_shot_num = kwargs.get('few_shot_num', 4)
-         if few_shot_num != 4 and few_shot_num != 0:
-             logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, '
-                          f'but got {few_shot_num}. Use 4-shot by default.')
-             kwargs['few_shot_num'] = 4
-
          super().__init__(**kwargs)

-     def load(self, **kwargs):
-         # default load all levels
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='level')
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     split_dir = os.path.join(dataset_name_or_path, split_name)
-                 else:
-                     split_dir = os.path.join(work_dir, dataset_name_or_path, split_name)
-                 split_files = glob.glob(os.path.join(split_dir, '**', '*.json'))
-                 split_data = []
-                 for file_path in split_files:
-                     if os.path.exists(file_path):
-                         with open(file_path, 'r', encoding='utf-8') as f:
-                             split_data.append(json.load(f))
-                 data_dict[subset_name][split_name] = split_data
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate the prompt for the model input.
-
-         Args:
-             input_d: raw input dict.
-             {"problem": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?", "level": "Level 3", "type": "Algebra", "solution": "The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$. Therefore, the graph has $\\boxed{2}$ vertical asymptotes."}
-
-             few_shot_list: few shot list. Each item is a raw input dict.
-             **kwargs:
-
-         Returns:
-             {'data': [prompt]}
-         """
-         use_fewshot = self.few_shot_num > 0
-         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-         full_prompt = self.prompt_template.format(query=query)
-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Extract the gold answer from the input dict.
-         return strip_answer_string(extract_answer(input_d['solution']))
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         # Note: Use same extraction method for both of checkpoint/service/custom
-         result = strip_answer_string(extract_answer(result))
-         return result
+         self.reformat_subset = True

-     def match(self, gold: str, pred: str) -> float:
-         res = math_equal(pred, gold)
-         return 1.0 if res else 0.0
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         from evalscope.metrics.math_parser import extract_answer

-     @classmethod
-     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
-         problem: str = input_d['problem']
+         return Sample(
+             input=record['problem'],
+             target=extract_answer(record['solution']),
+             subset_key=record['level'],
+             metadata={
+                 'reasoning': record.get('solution', ''),
+                 'type': record.get('type', ''),
+             },
+         )

-         if use_fewshot:
-             # Use 4-shot examples by system
-             context = (
-                 'Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'
-                 'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
-                 'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
-                 'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
-                 f'Problem:\n{problem}\nSolution:\n')
-         else:
-             context = 'Problem:\n' + problem + '\nSolution:\n'
-         return context
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
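
Taken together, the three hunks above follow the same migration pattern: benchmark metadata moves into a BenchmarkMeta registered with @register_benchmark, the adapter subclasses DefaultDataAdapter (or MultiChoiceAdapter for multiple-choice sets), and per-record preprocessing is concentrated in record_to_sample. A minimal hypothetical adapter in that style is sketched below; the benchmark name, dataset id, and record fields are invented for illustration:

# Hypothetical adapter following the 1.0 pattern shown in the hunks above.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_benchmark',  # invented name
        pretty_name='My-Benchmark',
        tags=[Tags.QA],
        description='Illustrative benchmark registration.',
        dataset_id='my-org/my-benchmark',  # invented dataset id
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyBenchmarkAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the unified Sample structure.
        return Sample(input=record['question'], target=record['answer'])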