evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -1,29 +1,14 @@
-from collections import defaultdict
 from typing import Any, Dict
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
 
 logger = get_logger()
 
-SUBSET_LIST = [
-    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
-    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
-    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
-    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
-    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
-    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
-    'world_religions'
-]
-
 SUBJECT_MAPPING = {
     'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
     'anatomy': ['Anatomy', 'health', 'Other'],
@@ -84,25 +69,31 @@ SUBJECT_MAPPING = {
     'world_religions': ['World Religions', 'philosophy', 'Humanities'],
 }
 
-
-@Benchmark.register(
-    name='mmlu_redux',
-    pretty_name='MMLU-Redux',
-    tags=['MCQ', 'Knowledge'],
-    description=
-    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
-    dataset_id='AI-ModelScope/mmlu-redux-2.0',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=SUBSET_LIST,
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=
-    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+SUBSET_LIST = list(SUBJECT_MAPPING.keys())
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mmlu_redux',
+        pretty_name='MMLU-Redux',
+        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options. ' # noqa: E501
+        'The bad answers are corrected.', # noqa: E501
+        dataset_id='AI-ModelScope/mmlu-redux-2.0',
+        subset_list=SUBSET_LIST,
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MMLUReduxAdapter(DataAdapter):
+class MMLUReduxAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -111,75 +102,38 @@ class MMLUReduxAdapter(DataAdapter):
             self.few_shot_num = 0
             logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
 
-        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-
-    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        if self.few_shot_num > 0:
-            prefix = self.format_fewshot_examples(few_shot_list)
-        else:
-            prefix = ''
-        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
-            self.__form_options(input_d['choices']) + '\n'
-
-        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-        return self.gen_prompt_data(full_prompt)
-
-    def format_fewshot_examples(self, few_shot_list):
-        # load few-shot prompts for each category
-        prompts = ''
-        for index, d in enumerate(few_shot_list):
-            prompts += 'Q: ' + d['question'] + '\n' + \
-                self.__form_options(d['choices']) + '\n'
-        return prompts
-
-    def __form_options(self, options: list):
-        option_str = 'Options are:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        answer_index = int(input_d['answer'])
-        return self.choices[answer_index]
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d: The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-
-        Args:
-            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'A', extracted from get_gold_answer method.
-            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                e.g. 'B', extracted from parse_pred_result method.
-
-        Returns:
-            The match result. Usually a score (float) for chat/multiple-choice-questions.
-        """
-        return exact_match(gold=gold, pred=pred)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        error_type = record['error_type']
+        choices = record['choices']
+        target_index_list = [int(record['answer'])]
+        correct_answer = record['correct_answer']
+        if error_type == 'no_correct_answer' and correct_answer:
+            choices[target_index_list[0]] = correct_answer
+        elif error_type == 'wrong_groundtruth' and correct_answer:
+            try:
+                target_index_list = [int(correct_answer)]
+            except ValueError:
+                choice_index = ord(correct_answer) - ord('A')
+                target_index_list = [choice_index]
+        elif error_type == 'multiple_correct_answers' and correct_answer:
+            correct_answer = correct_answer.strip('()')
+            try:
+                correct_answer = correct_answer.replace(' and ', ',').replace(' or ', ',')
+                target_index_list = list(map(int, correct_answer.split(',')))
+            except ValueError:
+                try:
+                    target_index_list = [ord(c) - ord('A') for c in correct_answer.split(',')]
+                except TypeError:
+                    # find the index of the correct answer in choices
+                    target_index_list = [choices.index(c) for c in correct_answer.split(',') if c in choices]
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=['ABCD'[i] for i in target_index_list] if target_index_list else ['A', 'B', 'C', 'D'],
+            metadata={
+                'error_type': error_type,
+                'correct_answer': correct_answer,
+                'potential_reason': record.get('potential_reason', ''),
+            },
+        )
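
Note: in the new API the removed gen_prompt/get_gold_answer/parse_pred_result/match methods collapse into the single record_to_sample hook above. The following standalone sketch (not part of the package) mirrors its answer-correction logic; the record fields ('answer', 'error_type', 'correct_answer') follow the mmlu-redux-2.0 schema shown in the hunk, and the real adapter has an extra fallback for malformed multi-answer labels.

def resolve_target_indices(record: dict) -> list:
    """Return the corrected list of gold choice indices for one record."""
    target = [int(record['answer'])]
    error_type = record.get('error_type')
    correct = record.get('correct_answer')
    if error_type == 'wrong_groundtruth' and correct:
        # The corrected label may be a numeric index ('2') or a letter ('C').
        try:
            target = [int(correct)]
        except ValueError:
            target = [ord(correct) - ord('A')]
    elif error_type == 'multiple_correct_answers' and correct:
        correct = correct.strip('()').replace(' and ', ',').replace(' or ', ',')
        try:
            target = [int(x) for x in correct.split(',')]
        except ValueError:
            target = [ord(x.strip()) - ord('A') for x in correct.split(',')]
    return target


# Example: a record flagged as 'wrong_groundtruth' whose corrected answer is the letter 'C'.
print(resolve_target_indices({'answer': '0', 'error_type': 'wrong_groundtruth', 'correct_answer': 'C'}))  # [2]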
evalscope/benchmarks/musr/musr_adapter.py
@@ -1,74 +1,43 @@
 import ast
 from typing import Any
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-
-
-@Benchmark.register(
-    name='musr',
-    pretty_name='MuSR',
-    tags=['Reasoning', 'MCQ'],
-    description=
-    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
-    dataset_id='AI-ModelScope/MuSR',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=
-    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.', # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='musr',
+        pretty_name='MuSR',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
+        dataset_id='AI-ModelScope/MuSR',
+        metric_list=['acc'],
+        subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+    )
 )
-class MuSRAdapter(DataAdapter):
+class MuSRAdapter(MultiChoiceAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
-
-    def load(self, **kwargs):
-        # default load all levels
-        kwargs['split_as_subset'] = True
-        data_dict = super().load(**kwargs)
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-
-        choices = self.format_choice(ast.literal_eval(input_d['choices']))
-
-        full_prompt = self.prompt_template.format(
-            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def format_choice(self, options: list):
-        option_str = ''
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return self.choices[input_d['answer_index']]
+        self.split_as_subset = True
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(result, options=self.choices)
+    def record_to_sample(self, record) -> Sample:
+        choices = ast.literal_eval(record['choices'])
+        choice_letters = ['A', 'B', 'C', 'D', 'E', 'F']
+        target_letter = choice_letters[record['answer_index']]
 
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return exact_match(gold=gold, pred=pred)
+        return Sample(
+            input=f"{record['narrative']}\n\n{record['question']}",
+            choices=choices,
+            target=target_letter,
+        )
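
Taken together, the two adapter diffs show the 1.0.0 registration pattern: a @register_benchmark decorator wrapping a BenchmarkMeta, an adapter subclassing MultiChoiceAdapter, and a record_to_sample method returning a Sample. Below is a hedged sketch of a custom benchmark under this API, inferred only from the hunks above; the benchmark name, dataset id, and record field names are hypothetical, while the imports and BenchmarkMeta parameters are the ones visible in this diff.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',  # hypothetical benchmark name
        pretty_name='My-MCQ',
        tags=[Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
        description='A custom multiple-choice benchmark.',
        dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw dataset record to a Sample; the field names here are assumptions.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],  # e.g. 'A'
        )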