evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,155 +1,94 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import mean
- from evalscope.utils.io_utils import jsonl_to_list
+ from typing import Any, Dict, List, Optional, Union
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageSystem, ChatMessageUser, dict_to_chat_message
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()

-
- @Benchmark.register(
-     name='general_qa',
-     pretty_name='General-QA',
-     description='A general question answering dataset for custom evaluation. '
-     'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).', # noqa: E501
-     tags=['QA', 'Custom'],
-     dataset_id='general_qa',
-     subset_list=['default'],
-     metric_list=['AverageBLEU', 'AverageRouge'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template='请回答问题\n{query}',
+ PROMPT_TEMPLATE = '请回答问题\n{question}'
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='general_qa',
+         pretty_name='General-QA',
+         description='A general question answering dataset for custom evaluation. '
+         'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).', # noqa: E501
+         tags=[Tags.QA, Tags.CUSTOM],
+         dataset_id='general_qa',
+         metric_list=['BLEU', 'Rouge'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+     )
  )
- class GeneralQAAdapter(DataAdapter):
+ class GeneralQAAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-         """
-         Load dataset from the given path or dataset name.
-
-         Args:
-             dataset_name_or_path (str): Path to dataset directory or file.
-             subset_list (list): List of subset names to load.
-
-         Returns:
-             dict: Loaded dataset organized by subset.
-         """
-         dataset_name_or_path = dataset_name_or_path or self.dataset_id
-         subset_list = subset_list or self.subset_list
-
-         data_file_dict = defaultdict(str)
-         data_item_dict = defaultdict(list)
-
-         # get data file path and subset name
-         if os.path.isdir(dataset_name_or_path):
-             for subset_name in subset_list:
-                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-         elif os.path.isfile(dataset_name_or_path):
-             cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-             data_file_dict[cur_subset_name] = dataset_name_or_path
-         else:
-             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-         # load data from local disk
-         try:
-             for subset_name, file_path in data_file_dict.items():
-                 data_item_dict[subset_name] = jsonl_to_list(file_path)
-         except Exception as e:
-             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-         data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate prompt for the model based on input data.
-
-         Args:
-             input_d (dict): Input data dictionary.
-             subset_name (str): Name of the subset.
-             few_shot_list (list): List of few-shot examples.
-
-         Returns:
-             dict: Dictionary containing the generated prompt.
-         """
-         messages = input_d.get('messages')
-         query = input_d.get('question', '') or input_d.get('query', '')
-         system_prompt = input_d.get('system')
-         prompt = self.prompt_template.format(query=query)
-         return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Extract the gold (reference) answer from the input data.
-
-         Args:
-             input_d (dict): Input data dictionary.
-
-         Returns:
-             str: Gold answer string.
-         """
-         return input_d.get('answer') or input_d.get('response')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-         """
-         Parse the prediction result.
-
-         Args:
-             result (str): Model prediction result.
-             raw_input_d (dict, optional): Original input data.
-             eval_type (str): Evaluation type.
+     def load_from_disk(self, **kwargs):
+         return super().load_from_disk(use_local_loader=True)

-         Returns:
-             str: Parsed prediction result.
-         """
-         return result
-
-     def match(self, gold: str, pred: str) -> dict:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         Compute metric scores between gold and predicted answers.
+         Convert a data record to a Sample object.

          Args:
-             gold (str): Gold answer.
-             pred (str): Predicted answer.
+             record (Dict[str, Any]): Input data record.

          Returns:
-             dict: Dictionary of computed metric scores.
+             Sample: Sample object with input, target, and metadata.
          """
-         # reference free metrics
-         if gold is None:
-             return {'AverageAccuracy': -1}
-
-         # calculate rouge and bleu scores
-         res = dict()
-         if 'AverageRouge' in self.metric_list:
-             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-
-             rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
-             res.update(rouge_dict)
-         if 'AverageBLEU' in self.metric_list:
-             from evalscope.metrics import bleu_ngram_one_sample
+         query = record.get('question') or record.get('query')
+         answer = record.get('answer') or record.get('response')
+         system_prompt = record.get('system')
+         messages = record.get('messages')
+
+         message_list = []
+         if messages:
+             message_list = [dict_to_chat_message(m) for m in messages]
+         else:
+             if system_prompt:
+                 message_list.append(ChatMessageSystem(content=system_prompt))
+             message_list.append(ChatMessageUser(content=query))

-             bleu_dict = bleu_ngram_one_sample(pred, gold)
-             res.update(bleu_dict)
-         return res
+         return Sample(input=message_list, target=answer or '')

-     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
          """
-         Compute weighted mean of the metric scores for all samples.
-
-         Args:
-             review_res_list (list): List of metric score dictionaries.
-
-         Returns:
-             list: List of dictionaries with averaged metric results.
+         Calculate evaluation scores by comparing prediction with reference.
          """
-         items = super().compute_dict_metric(review_res_list, **kwargs)
-         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Calculate scores for each configured metric
+         for metric in self.metric_list:
+             try:
+                 if metric == 'Rouge':
+                     from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+                     score.value.update(compute_rouge_score_one_sample_zh([filtered_prediction], [reference]))
+                 elif metric == 'BLEU':
+                     from evalscope.metrics import bleu_ngram_one_sample
+
+                     score.value.update(bleu_ngram_one_sample(filtered_prediction, reference))
+             except Exception as e:
+                 logger.error(f'Error calculating metric {metric}: {e}')
+                 return None
+
+         score.main_score_name = 'Rouge-L-R'
+         return score
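The rewritten GeneralQAAdapter replaces the old load/gen_prompt/match pipeline with record_to_sample and match_score. Below is a minimal, non-authoritative sketch of one input record, using only the keys the new code above reads ('question'/'query', 'answer'/'response', 'system', 'messages'); the values are made up for illustration, not taken from the package.

# Hedged sketch: one JSONL record as record_to_sample above would read it (values illustrative).
record = {
    'system': 'You are a helpful assistant.',      # optional -> ChatMessageSystem
    'question': 'What is the capital of France?',  # or 'query' -> ChatMessageUser
    'answer': 'Paris',                             # or 'response' -> Sample target
}
# With no 'messages' key, the adapter builds Sample(input=[system message, user message], target='Paris');
# match_score then fills Score.value with the configured 'BLEU'/'Rouge' metrics against that target.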
evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -1,63 +1,70 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
  import os
  import random
  import re
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
-
-
- @Benchmark.register(
-     name='gpqa',
-     pretty_name='GPQA',
-     tags=['MCQ', 'Knowledge'],
-     description=
-     'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
-     dataset_id='modelscope/gpqa',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
-     metric_list=['AveragePass@1'],
-     few_shot_num=5,
-     train_split=None,
-     eval_split='train', # only have train split
-     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import FEW_SHOT_TEMPLATE, MultipleChoiceTemplate
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='gpqa_diamond',
+         pretty_name='GPQA-Diamond',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+         description=
+         'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
+         dataset_id='AI-ModelScope/gpqa_diamond',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train', # only have train split
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER_COT,
+     )
  )
- class GPQAAdapter(DataAdapter):
+ class GPQAAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.choices = ['A', 'B', 'C', 'D']
-         if self.few_shot_num and self.few_shot_num > 0:
-             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n' # noqa: E501
-             self.prompt_prefix += open(
-                 os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
-                 encoding='utf-8').read() + '\nQuestion: '
-         else:
-             self.prompt_prefix = 'What is the correct answer to this question:'
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
-         example:
-         {
-             "question":"Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?",
-             "choice1":"1/2",
-             "choice2":"1/4",
-             "choice3":"2/3",
-             "choice4":"1/8",
-             "answer":"C",
-         }
-         """ # noqa: E501
-         processed_input_d = self.__process_input(input_d)
-         input_d['answer'] = processed_input_d['answer'] # add answer to input_d for answer extraction
-         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}" # noqa: E501
-
-         prompt = self.prompt_template.format(query=query)
-         return self.gen_prompt_data(prompt)
-
-     def __process_input(self, input_d: dict) -> dict:
+         if self.few_shot_num > 0 and self.few_shot_num != 5:
+             logger.warning(
+                 f'Only support few_shot_num 0 or 5 for {self.dataset_id}, but got {self.few_shot_num}. Use 5-shot by default.' # noqa: E501
+             )
+             self.few_shot_num = 5
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         # Process the input to create shuffled choices and correct answer
+         processed_data = self._process_input(record)
+
+         return Sample(
+             input=record['Question'],
+             choices=processed_data['choices'],
+             target=processed_data['answer'],
+             subset_key=record.get('subset', ''),
+             metadata={
+                 'correct_answer':
+                 record['Correct Answer'],
+                 'incorrect_answers':
+                 [record['Incorrect Answer 1'], record['Incorrect Answer 2'], record['Incorrect Answer 3']],
+             },
+         )
+
+     def format_fewshot_template(self, fewshot, sample):
+         from .prompt import FEW_SHOT_SAMPLES
+
+         return FEW_SHOT_TEMPLATE.format(fewshot=FEW_SHOT_SAMPLES, ) + self.format_prompt_template(sample)
+
+     def _process_input(self, input_d: dict) -> dict:
+         """Process input to shuffle choices and determine correct answer letter."""

          def preprocess(text):
              if text is None:
@@ -77,53 +84,7 @@ class GPQAAdapter(DataAdapter):
          random.shuffle(choices)
          correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))

-         out_doc = {
-             'choices': [choices[0], choices[1], choices[2], choices[3]],
+         return {
+             'choices': choices,
              'answer': f'{chr(65 + correct_answer_index)}',
          }
-         return out_doc
-
-     def __form_options(self, options: list):
-         option_str = 'Choices:\n'
-         for opt, choice in zip(options, self.choices):
-             option_str += f'({choice}) {opt}' + '\n'
-         return option_str
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d['answer']
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return GPQAAdapter.get_multiple_choice_answer(result)
-
-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-         """
-         return exact_match(gold=gold, pred=pred)
-
-     @staticmethod
-     def get_multiple_choice_answer(pred: str):
-         tmp = re.findall(r'\b(A|B|C|D)\b', pred.upper())
-         if tmp:
-             pred = tmp
-         else:
-             pred = [pred.strip().strip('.')]
-
-         if len(pred) == 0:
-             pred = ''
-         else:
-             pred = pred[-1]
-
-         # Remove the period at the end, again!
-         pred = pred.rstrip('.').rstrip('/')
-
-         return pred
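The GPQA adapter now subclasses MultiChoiceAdapter and emits Sample objects instead of hand-built prompts. A hedged sketch of the field mapping performed by record_to_sample above, using the column names visible in this diff; the values are shortened and illustrative only.

# Hedged sketch: how one GPQA record maps to a Sample (field names from the diff, values made up).
record = {
    'Question': 'What is the probability that person A wins this game?',
    'Correct Answer': '2/3',
    'Incorrect Answer 1': '1/2',
    'Incorrect Answer 2': '1/4',
    'Incorrect Answer 3': '1/8',
}
# _process_input shuffles the four answers into Sample.choices and returns the letter
# ('A'-'D') of the correct answer after shuffling, which becomes Sample.target; the
# original answer strings are preserved in Sample.metadata.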
evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py}
@@ -1,3 +1,5 @@
+ # flake8: noqa
+ FEW_SHOT_SAMPLES = """
  Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
  Choices:
  (A) 1/400
@@ -9,7 +11,8 @@ The expected proportion of individuals who carry the b allele but are not expect
  According to the Hardy-Weinberg equation p∧2 + 2pq + q∧2 = 1, where p is the frequency of dominant allele frequency, q is the frequency of recessive allele frequency, p∧2 is the frequency of the homozygous dominant allele, q∧2 is the frequency of the recessive allele, and 2pq is the frequency of the heterozygous allele.
  Given that q∧2=1/400, hence, q=0.05 and p=1-q=0.95.
  The frequency of the heterozygous allele is 2pq=2*0.05*0.95=38/400.
- The correct answer is (D)
+ ANSWER: D
+
  Question: A Fe pellet of 0.056 g is first dissolved in 10 mL of hydrobromic acid HBr (0.1 M). The resulting solution is then titrated by KMnO4 (0.02 M). How many equivalence points are there?
  Choices:
  (A) Two points, 25 ml and 35 ml
@@ -30,7 +33,8 @@ Reaction 1: MnO4- + 5Fe2+ + 8H+ → Mn2+ + 5Fe3+ + 4H2O
  Reaction 2: 2MnO4- + 10Br- + 16H+ → 2Mn2+ + 5Br2 + 8H2O
  So MnO4- will first react with Fe2+ with a stoichiometry of 1:5 so Veq1 will be 10 ml.
  Then when Fe2+ is used up, MnO4- will react with Br- with a stoichiometry of 2:10 then V added will be 25 ml so Veq2=25+10=35 ml.
- The correct answer is (A)
+ ANSWER: A
+
  Question: Consider a quantum mechanical system containing a particle of mass $m$ moving in an istropic three dimensional potential of the form $V(r) = 1/2 m \omega^2 r^2$ corresponding to the acted force obeying Hooke’s law. Here, $\omega$ is the angular frequency of oscillation and $r$ is the radial distance of the particle from the origin in spherical polar coordinate. What is the value of energy of the third excited state, and how many linearly independent eigenfunctions are possible for the same energy eigenvalue?
  Choices:
  (A) 11 \pi^2 \hbar^2 / (2m r^2), 3
@@ -45,7 +49,8 @@ For third excited state n=3.
  Thus the corresponding energy is $(9/2)\hbar \omega$.
  The degeneracy of the state is $g_n= (n+1)(n+2)/2$.
  For n=3, degeneracy is (3+1)*(3+2)/2=4*5/2=10.
- The correct answer is (B)
+ ANSWER: B
+
  Question: "Your overhear two chemists talking to each other as they leave a synthetic organic chemistry lab. One asks the other "So, how did it go?" The second chemist replies, "Not well - my compounds are on top of each other." What is the second chemist most likely referring to?"
  Choices:
  (A) The compounds they are working with have similar polarities.
@@ -55,7 +60,8 @@ Choices:
  Let's think step by step:
  "On top of each other" commonly refers to two compounds that have similar Rf values on chromatography (a common operation in synthetic chemistry).
  Similar Rf values arise for compounds with similar polarities.
- The correct answer is (A)
+ ANSWER: A
+
  Question: Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?
  Choices:
  (A) 1/2
@@ -78,4 +84,5 @@ The solution for this series is as follows : a1/(1-r) where a1 is the first numb
  a1=1/2
  r=(1/2)^2=1/4
  So a1/(1-r)=(1/2)/(1-1/4)=(1/2)/(3/4)=2/3.
- The correct answer is (C)
+ ANSWER: C
+ """