evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
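Most of the churn in this release comes from the new `evalscope/api` package: a benchmark is now described by a `BenchmarkMeta` object, registered through `evalscope.api.registry.register_benchmark`, and backed by an adapter class (e.g. `MultiChoiceAdapter` or `DefaultDataAdapter`) whose `record_to_sample` method maps raw dataset records onto unified `Sample` objects. The two adapter diffs below (C-Eval and Chinese-SimpleQA) show the migration in full; as orientation only, a minimal adapter under the new API would look roughly like the following sketch, where the benchmark name, dataset id, and record fields are illustrative rather than taken from the package.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq_benchmark',          # hypothetical benchmark name
        pretty_name='My-MCQ-Benchmark',
        tags=[Tags.KNOWLEDGE],
        description='Illustrative multiple-choice benchmark.',
        dataset_id='my-org/my-dataset',   # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],
        eval_split='test',
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw record into the unified Sample structure.
        return Sample(
            input=record['question'],
            choices=[record['A'], record['B'], record['C'], record['D']],
            target=record['answer'],
        )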
evalscope/benchmarks/ceval/ceval_adapter.py

@@ -1,73 +1,15 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import csv
- import os
- from collections import defaultdict
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
- from evalscope.utils.io_utils import csv_to_list
- from evalscope.utils.logger import get_logger

- # flake8: noqa
+ from functools import partial
+ from typing import Any, Dict

- logger = get_logger()
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Dataset, RemoteDataLoader, Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger

- SUBSET_LIST = [
-     'computer_network',
-     'operating_system',
-     'computer_architecture',
-     'college_programming',
-     'college_physics',
-     'college_chemistry',
-     'advanced_mathematics',
-     'probability_and_statistics',
-     'discrete_mathematics',
-     'electrical_engineer',
-     'metrology_engineer',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_chemistry',
-     'high_school_biology',
-     'middle_school_mathematics',
-     'middle_school_biology',
-     'middle_school_physics',
-     'middle_school_chemistry',
-     'veterinary_medicine',
-     'college_economics',
-     'business_administration',
-     'marxism',
-     'mao_zedong_thought',
-     'education_science',
-     'teacher_qualification',
-     'high_school_politics',
-     'high_school_geography',
-     'middle_school_politics',
-     'middle_school_geography',
-     'modern_chinese_history',
-     'ideological_and_moral_cultivation',
-     'logic',
-     'law',
-     'chinese_language_and_literature',
-     'art_studies',
-     'professional_tour_guide',
-     'legal_professional',
-     'high_school_chinese',
-     'high_school_history',
-     'middle_school_history',
-     'civil_servant',
-     'sports_science',
-     'plant_protection',
-     'basic_medicine',
-     'clinical_medicine',
-     'urban_and_rural_planner',
-     'accountant',
-     'fire_engineer',
-     'environmental_impact_assessment_engineer',
-     'tax_accountant',
-     'physician',
- ]
+ logger = get_logger()

  SUBJECT_MAPPING = {
      'computer_network': ['Computer Network', '计算机网络', 'STEM'],
@@ -124,115 +66,105 @@ SUBJECT_MAPPING = {
      'physician': ['Physician', '医师资格', 'Other']
  }

-
- @Benchmark.register(
-     name='ceval',
-     pretty_name='C-Eval',
-     tags=['Knowledge', 'MCQ', 'Chinese'],
-     description=
-     'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.', # noqa: E501
-     dataset_id='modelscope/ceval-exam',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split='dev',
-     eval_split='val',
-     prompt_template=
-     '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
+ # Based on the prompt template for Chinese evaluation
+ USER_PROMPT_TEMPLATE = """以下是中国关于{subject}的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 A、B、C、D 中的一个。
+
+ 问题:{question}
+ 选项:
+ {choices}
+ """.lstrip() # noqa: E501
+
+ FEWSHOT_TEMPLATE = """以下是一些示例问题:
+
+ {fewshot}
+
+ """.lstrip()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ceval',
+         pretty_name='C-Eval',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+         description=
+         'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.', # noqa: E501
+         dataset_id='evalscope/ceval',
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         metric_list=['acc'],
+         few_shot_num=5,
+         train_split='dev',
+         eval_split='val',
+         prompt_template=USER_PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class CEVALAdapter(DataAdapter):
+ class CEVALAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):

-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num > 5:
-             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-             kwargs['few_shot_num'] = 5
          super().__init__(**kwargs)

          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the C-Eval:
-
-             {'id': 0,
-             'question': '下列关于税法基本原则的表述中,不正确的是____。',
-             'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-             'B': '税收公平原则源于法律上的平等性原则',
-             'C': '税收效率原则包含经济效率和行政效率两个方面',
-             'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
-             'answer': 'D',
-             'explanation': ''}
-
-         Returns:
-             {'data': ['prompt ...']}
-         """
-
-         few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         if len(few_shot_prompts) > 0:
-             context: str = '\n'.join(few_shot_prompts) + '\n'
-         else:
-             context = ''
-
-         query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('answer', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         # Build choices list from A, B, C, D fields
+         choices = [record['A'], record['B'], record['C'], record['D']]
+         subset = self.current_subset_name
+
+         return Sample(
+             input=record['question'],
+             choices=choices,
+             target=record['answer'],
+             metadata={
+                 'id': record.get('id', ''),
+                 'explanation': record.get('explanation', ''),
+                 'subject': subset
+             },
+         )
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         q_str = f"""问题:{sample.input}"""
+         choices = sample.choices if sample.choices is not None else []
+         opt_str_list = []
+         for i, choice in enumerate(choices):
+             opt_str_list.append(f"""{chr(65 + i)}. {choice}""")
+         opt_str = '\n'.join(opt_str_list)
+         opt_str = f"""选项:\n{opt_str}"""
+         exp_str = f"""解析:{sample.metadata.get('explanation', '')}"""
+         ans_str = f"""答案:{sample.target}"""
+         final_str = '\n'.join([q_str, opt_str, exp_str, ans_str])
+
+         return final_str
+
+     def format_fewshot_template(self, fewshot, sample):
+         fewshot_str = FEWSHOT_TEMPLATE.format(fewshot=fewshot)
+         prompt_str = self.format_prompt_template(sample)
+         return fewshot_str + '\n' + prompt_str
+
+     def format_prompt_template(self, sample):
+         subject_name = SUBJECT_MAPPING.get(sample.metadata['subject'])[1]
+         choices = sample.choices if sample.choices is not None else []
+         choices_str = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(choices)])
+
+         return USER_PROMPT_TEMPLATE.format(subject=subject_name, question=sample.input, choices=choices_str)
+
+     def extract_answer(self, prediction, task_state) -> str:
          """
-         Parse the model output to get the answer. Could be the best choice index.
+         Extract the answer from the prediction based on the task state.

          Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+             prediction (str): The model's prediction string
+             task_state (dict): The current task state containing metadata

          Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
+             str: The extracted answer from the prediction
          """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _format_example(self, input_d: dict, include_answer=True):
-         example = '问题:' + input_d['question']
-         for choice in self.choices:
-             example += f'\n{choice}. {input_d[f"{choice}"]}'
+         import re

-         if include_answer:
-             example += '\n答案: ' + input_d['answer'] + '\n\n'
+         # Use regex to find the answer in the format "答案:LETTER"
+         match = re.search(r'答案:([A-D])', prediction)
+         if match:
+             return match.group(1)
          else:
-             example += '\n答案: '
-         return example
+             logger.warning(f'No valid answer found in prediction: {prediction}')
+             return ''
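With C-Eval now registered through the API above, running it should still go through the standard task entry point. A minimal smoke-test sketch, assuming `TaskConfig` and `run_task` keep their pre-1.0 shape (both `evalscope/config.py` and `evalscope/run.py` are modified rather than removed in this diff); the model id and sample limit are placeholders:

from evalscope import TaskConfig, run_task

# Hedged sketch: assumes the pre-1.0 TaskConfig / run_task entry point is unchanged.
task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['ceval'],                # benchmark name registered in the diff above
    limit=10,                          # evaluate only a few samples as a smoke test
)
run_task(task_cfg=task)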
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py

@@ -1,12 +1,14 @@
  import re
- from typing import Any, List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

- # flake8: noqa
-
  logger = get_logger()

  GRADER_TEMPLATE = """
@@ -40,7 +42,7 @@ GRADER_TEMPLATE = """
  模型预测6:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
  ```
  这些答复均为【错误】,因为:
- - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:“可能是”,“虽然我不确定,但我认为”),也视为错误。
+ - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:"可能是","虽然我不确定,但我认为"),也视为错误。

  以下是【未尝试】的答复示例:
  ```
@@ -56,16 +58,16 @@ GRADER_TEMPLATE = """
  - 回复中没有与标准答案矛盾的陈述。

  另外注意以下几点:
- - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题“金山铁路黄浦江特大桥的全长是多少米?”,标准答案为“3518.17”:
- - 预测答案“3518”、“3518.1”、“3518.17”均为【正确】。
- - 预测答案“3520”和“3600”均为【错误】。
- - 预测答案“大约3500米”和“超过3000米”被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
+ - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题"金山铁路黄浦江特大桥的全长是多少米?",标准答案为"3518.17":
+ - 预测答案"3518"、"3518.1"、"3518.17"均为【正确】。
+ - 预测答案"3520"和"3600"均为【错误】。
+ - 预测答案"大约3500米"和"超过3000米"被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
  - 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
- - 例如,考虑问题“菱镁矿的主要化学成分是什么?”标准答案为“碳酸镁(MgCO3)”。“碳酸镁”或“MgCO3”均视为【正确】答案。
+ - 例如,考虑问题"菱镁矿的主要化学成分是什么?"标准答案为"碳酸镁(MgCO3)"。"碳酸镁"或"MgCO3"均视为【正确】答案。
  - 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
- - 例如,问题“巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?”标准答案为“意大利撒丁岛”,预测答案“撒丁岛”被视为【正确】。
+ - 例如,问题"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?"标准答案为"意大利撒丁岛",预测答案"撒丁岛"被视为【正确】。
  - 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
- - 例如,如果标准答案是“Robinson”,那么回答鲁滨逊或者鲁滨孙均正确。
+ - 例如,如果标准答案是"Robinson",那么回答鲁滨逊或者鲁滨孙均正确。

  下面是一个新的问题示例。请只回复A、B、C之一,不要道歉或纠正自己的错误,只需要评估该回答。
  ```
@@ -80,88 +82,89 @@ B:【错误】
  C:【未尝试】

  只返回字母"A"、"B"或"C",无须添加其他文本。
- """.strip() # noqa E501
+ """.strip()

  SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应用科学', '生活、艺术与文化', '社会', '自然与自然科学']


- @Benchmark.register(
-     name='chinese_simpleqa',
-     pretty_name='Chinese-SimpleQA',
-     tags=['Knowledge', 'QA', 'Chinese'],
-     description=
-     "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.", # noqa: E501
-     subset_list=SUBSET_LIST,
-     dataset_id='AI-ModelScope/Chinese-SimpleQA',
-     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='train')
- class ChineseSimpleQAAdapter(DataAdapter):
+ @register_benchmark(
+     BenchmarkMeta(
+         name='chinese_simpleqa',
+         pretty_name='Chinese-SimpleQA',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.CHINESE],
+         description=
+         "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.", # noqa: E501
+         subset_list=SUBSET_LIST,
+         dataset_id='AI-ModelScope/Chinese-SimpleQA',
+         metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train',
+         prompt_template='请回答问题:\n\n{question}'
+     )
+ )
+ class ChineseSimpleQAAdapter(DefaultDataAdapter):

      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)

-         # register metrics
-         metric_registry.register(Metric(name='is_correct', object=mean))
-         metric_registry.register(Metric(name='is_incorrect', object=mean))
-         metric_registry.register(Metric(name='is_not_attempted', object=mean))
-
-         # whether to use LLM as a judge
-         self.llm_as_a_judge = True
-
-     def load(self, **kwargs):
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='primary_category', format='{}')
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         question = input_d['question']
-         return self.gen_prompt_data(question)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         return input_d['answer']
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-         return result.strip()
-
-     def match(self, gold: str, pred: str) -> float:
-         # simple match
-         logger.warning(f'Please use LLMJudge to match the result for {self.name}')
-         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
-         is_incorrect = not is_correct
-         is_not_attempted = 0
-         return {
-             'is_correct': is_correct,
-             'is_incorrect': is_incorrect,
-             'is_not_attempted': is_not_attempted,
+         self._use_llm_judge = True # Use LLM as a judge by default
+         self.reformat_subset = True # Reformat subset to primary_category
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         question = record['question']
+         answer = record['answer']
+         subset_key = record.get('primary_category', 'default') # Use primary_category as subset key
+         metadata = {
+             'id': record.get('id', 'unknown'),
+             'primary_category': subset_key,
+             'secondary_category': record.get('secondary_category', '')
          }

-     def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['question']
-         # get grading response
-         prompt = GRADER_TEMPLATE.format(question=question, target=gold, predicted_answer=pred)
+         return Sample(input=question, target=answer, subset_key=subset_key, metadata=metadata)
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Request judge and obtain score
+         prompt = GRADER_TEMPLATE.format(question=question, target=reference, predicted_answer=filtered_prediction)
          system_prompt = '你是一个智能助手,请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。'
-         grading_response = judge(prompt, system_prompt)
+         judge_response = self.llm_judge.judge(prompt, system_prompt=system_prompt)
          # parse grading response
-         match = re.search(r'(A|B|C)', grading_response)
+         match = re.search(r'(A|B|C)', judge_response)
          res = match.group(0) if match else 'C'
-         return {
+
+         # Set score based on the match result
+         score.value = {
              'is_correct': 1 if res == 'A' else 0,
              'is_incorrect': 1 if res == 'B' else 0,
              'is_not_attempted': 1 if res == 'C' else 0,
-             'judge_response': grading_response,
          }
-
-     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
-         """
-         compute weighted mean of the bleu score of all samples
-
-         Args:
-             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
-         """
-         # zip dict answers
-         res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-         return super().compute_metric(res_dict, **kwargs)
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'is_correct'
+         return score
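The rewritten Chinese-SimpleQA adapter delegates grading to the shared LLM-judge machinery (`evalscope/api/mixin/llm_judge_mixin.py` in the file list) and returns a structured `Score` whose main metric is `is_correct`. A hedged configuration sketch, assuming the judge-related `TaskConfig` fields keep their pre-1.0 names (`judge_strategy`, `judge_model_args`); the judge model, endpoint, and key values are placeholders:

import os

from evalscope import TaskConfig, run_task

# Hedged sketch: judge_strategy / judge_model_args are assumed to keep their 0.17.x names,
# and the string 'auto' is assumed to be accepted in place of the JudgeStrategy enum value.
task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',         # placeholder model under evaluation
    datasets=['chinese_simpleqa'],            # benchmark name registered in the diff above
    judge_strategy='auto',
    judge_model_args={
        'model_id': 'qwen2.5-72b-instruct',   # placeholder judge model
        'api_url': 'https://example.com/v1',  # placeholder OpenAI-compatible endpoint
        'api_key': os.getenv('JUDGE_API_KEY', 'EMPTY'),
    },
)
run_task(task_cfg=task)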