evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/benchmarks/data_collection/data_collection_adapter.py

@@ -1,28 +1,37 @@
-import math
+import copy
 import os
-import re
-from typing import Any, Optional
+from collections import defaultdict
+from typing import Any, Dict, List
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
-from evalscope.utils.io_utils import jsonl_to_list
+from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric.scorer import AggScore, SampleScore
+from evalscope.api.registry import get_benchmark, register_benchmark
+from evalscope.config import TaskConfig
+from evalscope.constants import DataCollection, Tags
+from evalscope.report.generator import ReportGenerator
+from evalscope.report.report import Report
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
-@Benchmark.register(
-    name='data_collection',
-    dataset_id='',  # dataset_id need to be set
-    description='Data collection',
-    subset_list=['default'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template='',
+@register_benchmark(
+    BenchmarkMeta(
+        name=DataCollection.NAME,
+        dataset_id='',  # dataset_id need to be set
+        description='Custom Data collection, mixing multiple evaluation datasets for '
+        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+        'assessment of the model\'s capabilities. '
+        '[Usage Reference](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/collection/index.html)',
+        tags=[Tags.CUSTOM],
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='',
+    )
 )
-class DataCollectionAdapter(DataAdapter):
+class DataCollectionAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         """
@@ -30,43 +39,176 @@ class DataCollectionAdapter(DataAdapter):
         """
         super().__init__(**kwargs)
 
-    def load(self,
-             dataset_name_or_path: str = None,
-             subset_list: list = None,
-             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-             datasets_hub: str = HubType.MODELSCOPE,
-             **kwargs) -> dict:
-        """
-        Load the dataset. Remote and local datasets are supported.
-        """
-        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
-        subset_list = subset_list or self.subset_list
-
+    def load(self):
         # Try to load dataset from local disk
+        dataset_name_or_path = self.dataset_id
         if os.path.exists(dataset_name_or_path):
             logger.info(f'Loading dataset from {dataset_name_or_path}')
-            dataset = jsonl_to_list(dataset_name_or_path)
-            if len(dataset) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
         else:
             from modelscope import dataset_snapshot_download
 
             # Load dataset from remote
-            logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            # download dataset snapshot
+            dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='*.jsonl')
+
+        dataset = LocalDataLoader(
+            data_id_or_path=dataset_path,
+            split=self.eval_split,
+            sample_fields=self.record_to_sample,
+            subset='test',  # NOTE: using hardcoded test subset
+            limit=self.limit,
+            repeats=self.repeats,
+            shuffle=self.shuffle,
+        ).load()
+
+        test_dataset = DatasetDict({self.default_subset: dataset})
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object. Every record is a DatasetEntry.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        from evalscope.collections import DatasetEntry
+
+        entry = DatasetEntry.model_validate(record)
+        sample = Sample.model_validate(entry.prompt)
+
+        record_without_prompt = copy.deepcopy(record)
+        del record_without_prompt['prompt']
+        sample.metadata[DataCollection.INFO] = record_without_prompt  # keep all metadata
+        return sample
+
+    def _post_process_samples(self):
+        """Post process of each sample"""
+        self._initialize_adapters()
+
+    def _initialize_adapters(self):
+        """Init adapters for each dataset and create dataset id map"""
+        self.dataset_adapters: Dict[str, DataAdapter] = {}
+        self.dataset_name_map = defaultdict(lambda: defaultdict(list))
+
+        # load dataset args
+        dataset_args = copy.deepcopy(self._task_config.dataset_args)
+
+        # Iterate through each sample in the dataset
+        dataset = self.test_dataset[self.default_subset]
+        for sample in dataset:
+            collection_info = sample.metadata.get(DataCollection.INFO, {})
+            dataset_name = collection_info.get('dataset_name', '')
+            subset_name = collection_info.get('subset_name', '')
+            # create id mapping
+            self.dataset_name_map[dataset_name][subset_name].append(sample.id)
+
+            # update dataset args
+            cur_dataset_args = dataset_args.get(dataset_name, {})
+
+            # Initialize dataset adapter
+            if dataset_name not in self.dataset_adapters:
+                config = TaskConfig(dataset_args={dataset_name: cur_dataset_args})
+                self.dataset_adapters[dataset_name] = get_benchmark(dataset_name, config=config)
+
+    def _get_adapter(self, metadata: Dict[str, Any]) -> DataAdapter:
+        collection_info = metadata.get(DataCollection.INFO, {})
+        dataset_name = collection_info.get('dataset_name', '')
+        return self.dataset_adapters.get(dataset_name)
+
+    def run_inference(self, model, sample, output_dir, **kwargs) -> TaskState:
+        data_adapter = self._get_adapter(sample.metadata)
+        if not data_adapter:
+            raise ValueError(f'No data adapter found for sample: {sample}')
+
+        return data_adapter.run_inference(model, sample, output_dir, **kwargs)
+
+    def calculate_metrics(self, task_state) -> SampleScore:
+        data_adapter = self._get_adapter(task_state.metadata)
+        if not data_adapter:
+            raise ValueError(f'No data adapter found for task state: {task_state}')
+
+        return data_adapter.calculate_metrics(task_state)
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]):
+        import pandas as pd
+        from tabulate import tabulate
+
+        data = []
+        for sample_score in sample_scores:
+            collection_info = sample_score.sample_metadata[DataCollection.INFO]
+            main_score = sample_score.score.main_value
+            main_metric = sample_score.score.main_score_name
+
+            # use main score
+            data.append(
+                dict(
+                    task_type=collection_info['task_type'],
+                    categories=tuple(collection_info['categories']),
+                    dataset_name=collection_info['dataset_name'],
+                    subset_name=collection_info['subset_name'],
+                    tags=collection_info['tags'],
+                    sample_id=sample_score.sample_id,
+                    metric=main_metric,
+                    score=main_score
+                )
+            )
+
+        df = pd.DataFrame(data)
+
+        def aggregate_and_sort(df, group_by_cols):
+            # aggregate by group_by_cols, and calculate average_score and count
+            report_df = df.groupby(group_by_cols) \
+                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                .reset_index()
+            report_df['average_score'] = report_df['average_score'].round(4)
+            report_df = report_df.sort_values(by='count', ascending=False) \
+                .to_dict(orient='records')
+            return report_df
+
+        # multi-level aggregation
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
+
+        # explode tags to multiple rows
+        df_exploded_tags = df.explode('tags')
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
-            dataset_path = dataset_snapshot_download(
-                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
-            # find the jsonl file
-            dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
-            dataset = jsonl_to_list(dataset_files[0])
+        # process multi-level categories
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories['categories'].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'category{level}'] = df_categories['categories'].apply(
+                lambda x: x[level] if len(x) > level else ''
+            )
+        category_report_df = aggregate_and_sort(
+            df_categories, [f'category{level}' for level in range(max_depth)] + ['metric']
+        )
 
-        return dataset
+        # convert to dict format
+        report_dict = {
+            'subset_level': subset_report_df,
+            'dataset_level': dataset_report_df,
+            'task_level': task_report_df,
+            'tag_level': tag_report_df,
+            'category_level': category_report_df,
+        }
 
-    def get_gold_answer(self, input_d: Any) -> Any:
-        return super().get_gold_answer(input_d)
+        # record report
+        for level, data in report_dict.items():
+            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+            logger.info(f'{level} Report:\n{table}')
 
-    def match(self, gold: Any, pred: Any) -> Any:
-        return super().match(gold, pred)
+        return df
 
-    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
-        return super().parse_pred_result(result, raw_input_d, eval_type)
+    def generate_report(self, scores, model_name, output_dir, **kwargs) -> Report:
+        df = scores[self.default_subset]
+        report = ReportGenerator.gen_collection_report(df, self.name, model_name)
+        return report
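
The hunks above capture the core API migration in 1.0: the `@Benchmark.register(...)` decorator on a `DataAdapter` subclass is replaced by `@register_benchmark(BenchmarkMeta(...))` on a `DefaultDataAdapter` subclass, and per-record parsing moves into `record_to_sample`, which returns a `Sample`. A minimal sketch of that registration pattern, using only names that appear in this diff; the benchmark name, dataset id, and field values below are illustrative placeholders, not part of the package:

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_benchmark',             # illustrative placeholder
        dataset_id='my-org/my-dataset',  # illustrative placeholder
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyBenchmarkAdapter(DefaultDataAdapter):
    """Hypothetical adapter illustrating the 1.0 registration pattern."""

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to the Sample(input, target, metadata)
        # structure consumed by the new evaluator pipeline.
        return Sample(
            input=record['question'],
            target=str(record['answer']),
            metadata={k: v for k, v in record.items() if k not in ('question', 'answer')},
        )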

evalscope/benchmarks/docmath/docmath_adapter.py

@@ -1,6 +1,14 @@
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
-from evalscope.metrics import LLMJudge
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 TEMPLATE_0SHOT = """Please read the following text and answer the question below.
 
@@ -13,73 +21,123 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below
 Format your response as follows: "Therefore, the answer is (insert answer here)"."""
 
 
-@Benchmark.register(
-    name='docmath',
-    pretty_name='DocMath',
-    tags=['Reasoning', 'Mathematics', 'Long Context'],
-    description=
-    'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
-    dataset_id='yale-nlp/DocMath-Eval',
-    metric_list=['AverageAccuracy'],
-    subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=TEMPLATE_0SHOT,
+@register_benchmark(
+    BenchmarkMeta(
+        name='docmath',
+        pretty_name='DocMath',
+        tags=[Tags.REASONING, Tags.MATH, Tags.LONG_CONTEXT],
+        description=
+        'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+        dataset_id='yale-nlp/DocMath-Eval',
+        metric_list=['acc'],
+        subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+        eval_split='test',
+        prompt_template=TEMPLATE_0SHOT,
+    )
 )
-class DocMathAdapter(DataAdapter):
+class DocMathAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self._use_llm_judge = True  # Enable LLM judge for DocMath
+        self.split_as_subset = True  # Use split as subset for DocMath
 
-    def load(self, **kwargs):
-        # default load mini test
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-        context = context = '\n'.join(input_d['paragraphs'])
-        question = input_d['question']
-        prompt = self.prompt_template.format(context=context, question=question)
-        return self.gen_prompt_data(prompt)
+        Convert a data record to a Sample object.
 
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d['ground_truth']
+        Args:
+            record (Dict[str, Any]): Input data record.
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        ground_truth = record['ground_truth']
+
+        return Sample(
+            input=record['question'],
+            target=str(ground_truth),
+            metadata={
+                'question_id': record.get('question_id', ''),
+                'paragraphs': record['paragraphs'],
+                'answer_type': type(ground_truth).__name__
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        context = '\n'.join(sample.metadata['paragraphs'])
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
+
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
-        Parse the predicted result and extract proper answer.
+        Extract the answer from the model prediction.
         """
         from .utils import extract_answer
 
-        extracted_answer = extract_answer(result)
+        extracted_answer = extract_answer(prediction)
         return extracted_answer
 
-    def match(self, gold: str, pred: str) -> float:
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
-        Match the gold answer and the predicted answer.
+        Calculate accuracy score by matching prediction with reference.
         """
         from .utils import get_acc
 
-        return get_acc(prediction=pred, gt=gold)
-
-    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        answer_type = task_state.metadata.get('answer_type', 'unknown')
+        accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
+        score.value = {'acc': accuracy}
+        score.main_score_name = 'acc'
+
+        return score
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Use LLM judge to evaluate the prediction against the reference.
+        """
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE
 
-        raw_input = kwargs.get('raw_input', None)
-        question = raw_input['question']
-        # get grading response
-        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
-        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
-        # parse grading response
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.metadata.get('question', '')
+
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+        # Parse grading response
         if 'YES' in orm_response:
-            return 1.0
+            accuracy = 1.0
         else:
-            return 0.0
+            accuracy = 0.0
+
+        score.value = {'acc': accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'acc'
+
+        return score
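
Scoring in 1.0 returns a `Score` object rather than a bare float, as the `match_score`/`llm_match_score` rewrites above show. A rough sketch of the object the DocMath adapter builds; the attribute names come from this diff, while the concrete values, and the assumption that `main_value` resolves to `value[main_score_name]` (as read by `aggregate_scores` in the data_collection adapter), are illustrative:

from evalscope.api.metric import Score

# Illustrative values only.
score = Score(
    extracted_prediction='3.14',
    prediction='... Therefore, the answer is 3.14',
)
score.value = {'acc': 1.0}     # one entry per metric name
score.main_score_name = 'acc'  # which entry downstream reports use

# The data_collection adapter above aggregates via score.main_value and
# score.main_score_name; presumably main_value == value[main_score_name].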

evalscope/benchmarks/docmath/utils.py

@@ -193,23 +193,22 @@ def compare_two_numbers(p, gt):
         return within_eps(pred=p, gt=gt)
 
 
-def get_acc(prediction, gt, cot=True):
+def get_acc(prediction, gt, answer_type, cot=True):
     try:
         if cot:
             prediction = normalize(prediction)
         else:
             prediction = float(prediction)
 
-        answer_type = type(gt).__name__
         assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
         if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
             # Comparing prediction against the reference
             if answer_type in ['bool']:
-                acc = int(prediction == gt)
+                acc = int(prediction == bool(gt))
             elif answer_type == 'int':
-                acc = int(compare_two_numbers(prediction, gt))
+                acc = int(compare_two_numbers(prediction, int(gt)))
             elif answer_type == 'float' or answer_type == 'float64':
-                acc = int(compare_two_numbers(prediction, gt))
+                acc = int(compare_two_numbers(prediction, float(gt)))
             else:
                 acc = 0
         else:
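
The `get_acc` signature change pairs with the adapter change above: the gold value now arrives as a string target while the original type is carried separately in `metadata['answer_type']`, so the caller passes the type explicitly and `get_acc` casts the gold value back before comparing. A hedged example of the new call, assuming the helpers behave as in the hunk:

from evalscope.benchmarks.docmath.utils import get_acc

# 0.17.x inferred the type from the gold value itself:
#     get_acc(prediction='Therefore, the answer is 3.14', gt=3.14)
# 1.0.x receives gt as a string plus an explicit answer_type, and casts
# gt back (bool/int/float) inside get_acc before the comparison:
acc = get_acc(prediction='Therefore, the answer is 3.14', gt='3.14', answer_type='float')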

evalscope/benchmarks/drop/drop_adapter.py

@@ -1,8 +1,13 @@
+import ast
 import re
-from typing import List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -28,54 +33,82 @@ Answer: 43
 '''  # noqa: E501
 
 
-@Benchmark.register(
-    name='drop',
-    pretty_name='DROP',
-    tags=['Reasoning'],
-    description=
-    'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
-    dataset_id='AI-ModelScope/DROP',
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='validation',
-    prompt_template=
-    'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='drop',
+        pretty_name='DROP',
+        tags=[Tags.REASONING],
+        description=
+        'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
+        dataset_id='AI-ModelScope/DROP',
+        metric_list=['acc'],
+        few_shot_num=3,
+        train_split=None,
+        eval_split='validation',
+        prompt_template=
+        'You will be asked to read a passage and answer a question. {drop_examples}\n# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+    )
 )
-class DROPAdapter(DataAdapter):
+class DROPAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        few_shot_num = kwargs.get('few_shot_num', 0)
-        if few_shot_num != 0:
+        if self.few_shot_num != 0:
             self.few_shot_num = 3
             logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
         else:
             self.few_shot_num = 0
 
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         """
-        Generate model prompt from input data.
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
         """
-        drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
-        query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
-        prompt = self.prompt_template.format(
+
+        # Parse gold answers
+        gold_answers = self._get_gold_answers(record)
+
+        return Sample(
+            input=record['question'],
+            target=str(gold_answers),
+            metadata={
+                'passage': record['passage'],
+                'answer': record['answer'],
+                'validated_answers': record['validated_answers']
+            }
+        )
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        drop_examples = ''
+        query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+        return self.prompt_template.format(
            drop_examples=drop_examples,
            query=query,
        )
-        return self.gen_prompt_data(prompt)
 
-    def get_gold_answer(self, input_d: dict) -> List[str]:
+    def format_fewshot_template(self, fewshot, sample):
+        drop_examples = DROP_EXAMPLES
+        query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+        return self.prompt_template.format(
+            drop_examples=drop_examples,
+            query=query,
+        )
+
+    def _get_gold_answers(self, input_d: dict) -> List[str]:
         """
         Parse the raw input labels (gold).
         """
 
         def _flatten_validated_answers(validated_answers):
-            """Flattens a dict of lists of validated answers.
-            {"number": ['1', '8'], ...}
-            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
-            """
+            """Flattens a dict of lists of validated answers."""
             valid_answers = []
             for i in range(len(validated_answers['number'])):
                 valid_answers.append({
@@ -96,24 +129,36 @@ class DROPAdapter(DataAdapter):
             answers.append(answer)
         return answers
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+    def extract_answer(self, prediction: str, task_state: TaskState):
         """
-        Parse the predicted result and extract proper answer.
+        Extract the answer from the model prediction.
         """
-        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
-        extracted_answer = match.group(1) if match else result
+        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', prediction)
+        extracted_answer = match.group(1) if match else prediction
         return extracted_answer
 
-    def match(self, gold: List[str], pred: str) -> float:
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
         """
-        Match the gold answer and the predicted answer.
+        Calculate accuracy score by matching prediction with reference answers.
        """
         from .utils import _answer_to_bags
 
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
         max_em = 0
-        for gold_answer in gold:
+        reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
+        for gold_answer in reference:
             # Convert the answers to bags of answers
-            predicted_bags = _answer_to_bags(pred)
+            predicted_bags = _answer_to_bags(filtered_prediction)
             gold_bags = _answer_to_bags(gold_answer)
 
             if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
@@ -124,7 +169,10 @@ class DROPAdapter(DataAdapter):
             if gold_answer[0].strip():
                 max_em = max(max_em, exact_match)
 
-        return max_em
+        score.value = {'acc': max_em}
+        score.main_score_name = 'acc'
+
+        return score
 
     @staticmethod
     def parse_answer(answer):
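
One detail worth noting in the DROP hunks: `record_to_sample` stores the gold answers as a stringified Python list (`target=str(gold_answers)`), and `match_score` recovers the list with `ast.literal_eval` before the bag-of-answers comparison. A minimal standalone illustration of that round-trip (independent of evalscope; the answer values are made up):

import ast

gold_answers = ['43', 'forty three']  # illustrative gold answers
target = str(gold_answers)            # what record_to_sample stores: "['43', 'forty three']"

# What match_score does before iterating over the gold answers:
reference = ast.literal_eval(target) if isinstance(target, str) else target
assert reference == gold_answers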