evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/aime/aime24_adapter.py
@@ -1,5 +1,12 @@
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -7,46 +14,37 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()


-@Benchmark.register(
-    name='aime24',
-    pretty_name='AIME-2024',
-    tags=['Mathematics'],
-    description=
-    'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
-    dataset_id='HuggingFaceH4/aime_2024',
-    subset_list=['default'],
-    metric_list=['AveragePass@1'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='train', # Only train set is available
-    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+@register_benchmark(
+    BenchmarkMeta(
+        name='aime24',
+        pretty_name='AIME-2024',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
+        dataset_id='HuggingFaceH4/aime_2024',
+        subset_list=['default'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train', # Only train set is available
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
 )
-class AIME24Adapter(DataAdapter):
+class AIME24Adapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-        """
-        problem = input_d['problem']
-        full_prompt = self.prompt_template.format(query=problem)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return strip_answer_string(input_d['answer'])
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        # Note: Use same extraction method for both of checkpoint/service/custom
-        result = strip_answer_string(extract_answer(result))
-        return result
-
-    def match(self, gold: str, pred: str) -> float:
-        res = math_equal(pred, gold)
-        return 1.0 if res else 0.0
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['problem'],
+            target=record['answer'],
+            metadata={
+                'problem_id': record.get('id', ''),
+                'solution': record.get('solution', ''),
+            },
+        )
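
The same migration pattern repeats throughout the adapter changes in this release: the old `@Benchmark.register(...)` decorator with `gen_prompt` / `get_gold_answer` / `parse_pred_result` / `match` methods is replaced by `@register_benchmark(BenchmarkMeta(...))` plus a single `record_to_sample` hook, with prompting, answer extraction, and matching handled by the base adapter. A minimal sketch of the new-style adapter follows, using only names visible in the hunk above; the benchmark name and dataset id are hypothetical placeholders, not part of evalscope:

# Illustrative sketch of the 1.0.0 adapter pattern shown in the hunk above.
# The benchmark name and dataset_id are hypothetical placeholders.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_math_bench',  # hypothetical benchmark name
        pretty_name='My-Math-Bench',
        tags=[Tags.MATH, Tags.REASONING],
        description='Example math benchmark registration.',
        dataset_id='my-org/my_math_bench',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=[{'acc': {'numeric': True}}],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
    )
)
class MyMathBenchAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to the framework's Sample object.
        return Sample(input=record['question'], target=record['answer'])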

evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,5 +1,12 @@
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -7,46 +14,33 @@ from evalscope.utils.logger import get_logger
 logger = get_logger()


-@Benchmark.register(
-    name='aime25',
-    pretty_name='AIME-2025',
-    tags=['Mathematics'],
-    description=
-    'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
-    dataset_id='opencompass/AIME2025',
-    subset_list=['AIME2025-I', 'AIME2025-II'],
-    metric_list=['AveragePass@1'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test', # Only train set is available
-    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+@register_benchmark(
+    BenchmarkMeta(
+        name='aime25',
+        pretty_name='AIME-2025',
+        tags=[Tags.MATH, Tags.REASONING],
+        description=
+        'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model\'s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
+        dataset_id='opencompass/AIME2025',
+        subset_list=['AIME2025-I', 'AIME2025-II'],
+        metric_list=[{
+            'acc': {
+                'numeric': True
+            }
+        }],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+    )
 )
-class AIME25Adapter(DataAdapter):
+class AIME25Adapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-        """
-        problem = input_d['question']
-        full_prompt = self.prompt_template.format(query=problem)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return strip_answer_string(input_d['answer'])
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        # Note: Use same extraction method for both of checkpoint/service/custom
-        result = strip_answer_string(extract_answer(result))
-        return result
-
-    def match(self, gold: str, pred: str) -> float:
-        res = math_equal(pred, gold)
-        return 1.0 if res else 0.0
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+        )

evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -1,16 +1,17 @@
 import re
-from collections import defaultdict
-from typing import Any, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

-# flake8: noqa
-
 logger = get_logger()

-GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
+GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.""" # noqa: E501

 GRADER_TEMPLATE = """
 I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
@@ -44,64 +45,89 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 """.strip() # noqa: E501


-@Benchmark.register(
-    name='alpaca_eval',
-    pretty_name='AlpacaEval2.0',
-    tags=['Instruction-Following', 'Arena'],
-    description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
-    'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
-    'provide more accurate and cost-effective model assessments. '
-    'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
-    dataset_id='AI-ModelScope/alpaca_eval',
-    subset_list=['alpaca_eval_gpt4_baseline'],
-    metric_list=['winrate'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='eval')
-class AlpacaEvalAdapter(DataAdapter):
+@register_benchmark(
+    BenchmarkMeta(
+        name='alpaca_eval',
+        pretty_name='AlpacaEval2.0',
+        tags=[Tags.INSTRUCTION_FOLLOWING, Tags.ARENA],
+        description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+        'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+        'provide more accurate and cost-effective model assessments. '
+        'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
+        dataset_id='AI-ModelScope/alpaca_eval',
+        subset_list=['alpaca_eval_gpt4_baseline'],
+        metric_list=['winrate'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='eval',
+        prompt_template='{question}'
+    )
+)
+class AlpacaEvalAdapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        # register metrics
-        metric_registry.register(Metric(name='winrate', object=mean))
-
-        # whether to use LLM as a judge
-        self.llm_as_a_judge = True
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        question = input_d['instruction']
-        return self.gen_prompt_data(question)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d['output']
+        self._use_llm_judge = True # Use LLM as a judge by default
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        instruction = record['instruction']
+        baseline_output = record['output'] # baseline model output
+
+        return Sample(
+            input=instruction,
+            target=baseline_output,
+            metadata={
+                'generator': record.get('generator', 'unknown'),
+                'dataset': record.get('dataset', 'unknown')
+            }
+        )
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        instruction = task_state.input_text
+
+        # Request judge and obtain score
+        # reference is baseline answer 'm', filtered_prediction is model answer 'M'
+        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=reference, output_2=filtered_prediction)
+        judge_response = self.llm_judge.judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-        return result.strip()
-
-    def match(self, gold: str, pred: str):
-        # simple match
-        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-        return None
-
-    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
-        raw_input = kwargs.get('raw_input', None)
-        instruction = raw_input['instruction']
-        # gold is baseline answer 'm', pred is model answer 'M'
-        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
-        # get grading response
-        grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
         # parse grading response
-        match = re.search(r'(m|M)', grading_response)
+        match = re.search(r'(m|M)', judge_response)
         res = match.group(0) if match else None
+
         if res:
-            return res == 'M'
+            winrate = 1 if res == 'M' else 0
         else:
-            logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
-            return None
-
-    def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
-        # zip dict answers
-        res_list = [res for res in review_res_list if res is not None]
-
-        return super().compute_metric(res_list, **kwargs)
+            logger.info(f'Failed to parse grading response: {prompt=}\n {judge_response=}')
+
+            winrate = 0
+
+        # Set score based on the match result
+        score.value = {'winrate': winrate}
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+        score.main_score_name = 'winrate'

evalscope/benchmarks/arc/arc_adapter.py
@@ -1,159 +1,46 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-import json
-import os
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
-
-# flake8: noqa
+from evalscope.utils.multi_choices import MultipleChoiceTemplate

 logger = get_logger()


-@Benchmark.register(
-    name='arc',
-    pretty_name='ARC',
-    tags=['Reasoning', 'MCQ'],
-    description=
-    'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.', # noqa: E501
-    dataset_id='modelscope/ai2_arc',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split='train',
-    eval_split='test',
-    prompt_template=
-    'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.', # noqa
+@register_benchmark(
+    BenchmarkMeta(
+        name='arc',
+        pretty_name='ARC',
+        tags=[Tags.REASONING, Tags.MULTIPLE_CHOICE],
+        description=
+        'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.', # noqa: E501
+        dataset_id='allenai/ai2_arc',
+        subset_list=['ARC-Easy', 'ARC-Challenge'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split='train',
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+    )
 )
-class ARCAdapter(DataAdapter):
+class ARCAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
-        few_shot_num = kwargs.get('few_shot_num', None)
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for ARC.')
-            few_shot_num = 0
-
-        if few_shot_num != 0:
-            logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
-
         super().__init__(**kwargs)

-        self.choices = ['A', 'B', 'C', 'D']
-
-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        """
-        Load the dataset from local disk.
-
-        dataset_name_or_path: str, the dataset id or path. e.g. 'arc'
-        subset_list: list, the subset list to load. e.g. ['ARC-Easy', 'ARC-Challenge']
-        work_dir: str, the local root data directory. e.g. '/path/to/data'
-        kwargs: dict, other arguments.
-        """
-        data_dict = {}
-        for subset_name in subset_list:
-            if os.path.exists(dataset_name_or_path):
-                subset_path = os.path.join(dataset_name_or_path, subset_name)
-            else:
-                subset_path = os.path.join(work_dir, dataset_name_or_path, subset_name)
-            for split_name in ['Train', 'Test']:
-                split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
-                if os.path.exists(split_path):
-                    with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
-                        rows = []
-                        for line in in_f:
-                            item = json.loads(line.strip())
-                            raw_choices = item['question']['choices']
-                            rows.append({
-                                'id': item['id'],
-                                'question': item['question']['stem'],
-                                'choices': {
-                                    'text': [d['text'] for d in raw_choices],
-                                    'label': [d['label'] for d in raw_choices]
-                                },
-                                'answerKey': item['answerKey'],
-                            })
-
-                        if subset_name in data_dict:
-                            data_dict[subset_name].update({split_name.lower(): rows})
-                        else:
-                            data_dict[subset_name] = {split_name.lower(): rows}
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw data, unify the prompt format for ARC benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the ARC:
-
-            {
-                'id': 'Mercury_7220990',
-                'question': 'Which factor will most likely cause a person to develop a fever?',
-                'choices':
-                {
-                    'text':['a leg muscle relaxing after exercise',
-                            'a bacterial population in the bloodstream',
-                            'several viral particles on the skin',
-                            'carbohydrates being digested in the stomach'],
-                    'label': ['A', 'B', 'C', 'D']
-                },
-                'answerKey': 'B'
-            }
-
-        Returns:
-            {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
-        """
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)
-
-        full_prompt = self.prompt_template.format(query=context)
-
-        return self.gen_prompt_data(full_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d.get('answerKey', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-
-        Args:
-            result: Predicted answer from the model. Usually a string for chat.
-            raw_input_d (dict): The raw input. Depending on the dataset.
-            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-        Returns:
-            The parsed answer. Depending on the dataset. Usually a string for chat.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option(text=result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
-
-        example: str = input_d['question']
-
-        choices_texts: list = input_d['choices']['text']
-        choices_labels: list = input_d['choices']['label']
-        choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
-        example += '\n' + choices_prompts
-
-        if include_answer:
-            example += '\nAnswer:'
-            example += ' {}\n\n'.format(input_d['answerKey'])
-
-        return example
+    def record_to_sample(self, record) -> Sample:
+        # Convert choice labels to indices (A->0, B->1, etc.)
+        choice_texts = record['choices']['text']
+        answer_key = record['answerKey']
+
+        return Sample(
+            input=record['question'],
+            choices=choice_texts,
+            target=answer_key,
+            metadata={
+                'id': record.get('id', ''),
+            },
+        )
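
For multiple-choice benchmarks, prompt construction also leaves the adapter: `MultiChoiceAdapter` and `MultipleChoiceTemplate.SINGLE_ANSWER` take over the work previously done by `gen_prompt` / `_generate_prompt`, and the adapter only maps a record to a `Sample` with option texts and a letter target. A sketch of a hypothetical MCQ adapter in the same shape as the ARC hunk above; the benchmark name and dataset id are placeholders, not part of evalscope:

# Illustrative sketch following the ARC hunk; 'my_mcq' and its dataset_id are hypothetical.
from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.multi_choices import MultipleChoiceTemplate


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq',  # hypothetical benchmark name
        pretty_name='My-MCQ',
        tags=[Tags.MULTIPLE_CHOICE],
        description='Example multiple-choice benchmark registration.',
        dataset_id='my-org/my_mcq',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        few_shot_num=0,
        train_split='train',
        eval_split='test',
        prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # 'choices' holds the option texts; 'target' is the gold answer letter (e.g. 'B').
        return Sample(
            input=record['question'],
            choices=record['choices']['text'],
            target=record['answerKey'],
        )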