evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ifeval/instructions_registry.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """Registry of all instructions."""

-from evalscope.benchmarks.ifeval import instructions
+from . import instructions

 _KEYWORD = 'keywords:'

evalscope/benchmarks/ifeval/utils.py
@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union

-from evalscope.benchmarks.ifeval import instructions_registry
+from . import instructions_registry


 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)

     return {
-        'prompt_level_strict_acc': out_strict.follow_all_instructions,
-        'inst_level_strict_acc': out_strict.follow_instruction_list,
-        'prompt_level_loose_acc': out_loose.follow_all_instructions,
-        'inst_level_loose_acc': out_loose.follow_instruction_list,
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }


 def agg_inst_level_acc(items):
-    flat_items = [item for sublist in items for item in sublist]
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
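In the ifeval change above, agg_inst_level_acc now averages the flat list of per-instruction booleans for a single prompt (returning 0 for an empty list), and process_results calls it per prompt instead of storing the raw instruction list. A minimal illustration with hypothetical values, assuming the semantics shown in the hunk:

# Hypothetical values; mirrors the new agg_inst_level_acc behaviour shown above.
follow_instruction_list = [True, False, True]  # one boolean per instruction in a single prompt
inst_level_strict = sum(follow_instruction_list) / len(follow_instruction_list) if follow_instruction_list else 0
print(inst_level_strict)  # 0.666...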
evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -1,70 +1,35 @@
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-
-
-@Benchmark.register(
-    name='iquiz',
-    pretty_name='IQuiz',
-    tags=['Knowledge', 'MCQ', 'Chinese'],
-    description=
-    'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
-    dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=['IQ', 'EQ'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='iquiz',
+        pretty_name='IQuiz',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+        description=
+        'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.',  # noqa: E501
+        dataset_id='AI-ModelScope/IQuiz',
+        metric_list=['acc'],
+        subset_list=['IQ', 'EQ'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+    )
 )
-class IQuizAdapter(DataAdapter):
+class IQuizAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.choices = ['A', 'B', 'C', 'D', 'E']
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
-        example:
-        {
-            "question":"天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
-            "choices":["星期一","星期二","星期三","星期四"],
-            "answer":"D",
-            "level":1
-        }
-        """
-        prompt = f"问题: {input_d['question']}\n"
-        prompt += self.__form_options(input_d['choices'])
-        return self.gen_prompt_data(prompt)
-
-    def __form_options(self, options: list):
-        option_str = '选项:\n'
-        for opt, choice in zip(options, self.choices):
-            option_str += f'({choice}): {opt}' + '\n'
-        return option_str
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        return exact_match(gold=gold, pred=pred)
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['question'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={'level': record.get('level', 'unknown')},
+        )
evalscope/benchmarks/live_code_bench/evaluate_utils.py
@@ -130,8 +130,8 @@ def evaluate_generations(
         results[index] = result
         metadata[index] = meta

-    assert len(results) == len(
-        generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'
+    assert len(results
+               ) == len(generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'

     return results, metadata

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -1,88 +1,138 @@
-from tqdm import tqdm
-
-from evalscope.benchmarks import Benchmark, DataAdapter
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import convert_numpy_types
 from evalscope.utils.logger import get_logger

 logger = get_logger()


-@Benchmark.register(
-    name='live_code_bench',
-    pretty_name='Live-Code-Bench',
-    tags=['Coding'],
-    description=
-    'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
-    dataset_id='AI-ModelScope/code_generation_lite',
-    subset_list=['release_latest'],
-    metric_list=['Pass@1'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    extra_params={
-        'start_date': None,
-        'end_date': None,
-        'timeout': 6,
-        'debug': False
-    },
-    system_prompt=
-    'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.',  # noqa: E501
-    prompt_template=
-    '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='live_code_bench',
+        pretty_name='Live-Code-Bench',
+        tags=[Tags.CODING],
+        description=
+        'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.',  # noqa: E501
+        dataset_id='AI-ModelScope/code_generation_lite',
+        subset_list=['release_latest'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        '### Question:\n{question_content}\n\n{format_prompt} ### Answer: (use the provided format with backticks)\n\n',
+        extra_params={
+            'start_date': None,
+            'end_date': None,
+            'timeout': 6,
+            'debug': False
+        },
+    )
 )
-class LiveCodeBenchAdapter(DataAdapter):
+class LiveCodeBenchAdapter(DefaultDataAdapter):
+    """
+    Live Code Bench adapter using the new data processing framework.
+    """

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        extra_params = kwargs.get('extra_params', {})
-
-        self.timeout = extra_params.get('timeout', 6)
-        self.debug = extra_params.get('debug', False)
-        self.start_date = extra_params.get('start_date')
-        self.end_date = extra_params.get('end_date')
-
-    def load(self, **kwargs) -> dict:
-        from .load_utils import filter_date, transform
-
-        # Note: need trust_remote_code=True to load the python script
-        dataset_dict = super().load(trust_remote_code=True, **kwargs)
-        new_dataset_dict = {}
-        for subset_key, dataset in dataset_dict.items():
-            datasets = dataset[self.eval_split]
-            filtered_datasets = filter_date(datasets, start_date=self.start_date, end_date=self.end_date)
-
-            transformed_datasets = [transform(item) for item in tqdm(filtered_datasets, desc='Transforming data')]
-            new_dataset_dict[subset_key] = {self.eval_split: transformed_datasets}
-        return new_dataset_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate the prompt for the model input.
-        """
-        format_prompt = input_d['format_prompt']
-        question_content = input_d['question_content']
+        self.timeout = self.extra_params.get('timeout', 6)
+        self.debug = self.extra_params.get('debug', False)
+        self.start_date = self.extra_params.get('start_date')
+        self.end_date = self.extra_params.get('end_date')
+
+        self.save_metadata = False  # Don't save metadata, since they are large
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        from .load_utils import transform
+
+        record = transform(record)
+
+        question_content = record['question_content']
+        format_prompt = record['format_prompt']
         full_prompt = self.prompt_template.format(question_content=question_content, format_prompt=format_prompt)

-        return self.gen_prompt_data(full_prompt)
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target='',
+            metadata={
+                'evaluation_sample': record['evaluation_sample'],
+                'contest_date': record['contest_date']
+            }
+        )

-    def get_gold_answer(self, input_d: dict) -> str:
-        # Extract the gold answer from the input dict.
-        return input_d
+    def sample_filter(self, sample):
+        from .load_utils import filter_date

-    def match(self, gold: dict, pred: str) -> float:
-        from .evaluate_utils import codegen_metrics
+        return filter_date(sample.metadata['contest_date'], start_date=self.start_date, end_date=self.end_date)
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
         from .extract_utils import extract_code_generation
+        return extract_code_generation(prediction)

-        ext_pred = extract_code_generation(pred)
-
-        references = [{'input_output': gold['evaluation_sample']}]
-        predictions = [[ext_pred]]
-        metrics, eval_results, final_metadata = codegen_metrics(
-            references,
-            predictions,
-            k_list=[1],
-            num_process_evaluate=1,
-            timeout=self.timeout,
-            debug=self.debug,
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        from .evaluate_utils import codegen_metrics
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
         )
-        return metrics['pass@1'] / 100  # convert to point scale
+
+        references = [{'input_output': task_state.metadata['evaluation_sample']}]
+        predictions = [[filtered_prediction]]
+
+        try:
+            metrics, eval_results, final_metadata = codegen_metrics(
+                references,
+                predictions,
+                k_list=[1],
+                num_process_evaluate=1,
+                timeout=self.timeout,
+                debug=self.debug,
+            )
+            pass_rate = metrics['pass@1'] / 100  # convert to point scale
+
+            score.value = {'pass': float(pass_rate > 0)}
+            score.explanation = f"Pass@1: {metrics['pass@1']}%"
+
+            # Convert numpy types to native Python types for JSON serialization
+            serializable_eval_results = convert_numpy_types(eval_results)
+            serializable_final_metadata = convert_numpy_types(final_metadata)
+
+            score.metadata = {
+                'pass_rate': float(pass_rate),
+                'timeout': self.timeout,
+                'debug': self.debug,
+                'eval_results': serializable_eval_results,
+                'final_metadata': serializable_final_metadata
+            }
+        except Exception as e:
+            score.value = {'pass': False}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+
+        score.main_score_name = 'pass'
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # calculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
evalscope/benchmarks/live_code_bench/load_utils.py
@@ -32,8 +32,8 @@ def transform(item):
         private_test_cases = json.loads(item['private_test_cases'])
     except Exception as e:  # noqa: F841
         private_test_cases = json.loads(
-            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))  # type: ignore
-                                         )))  # type: ignore
+            pickle.loads(zlib.decompress(base64.b64decode(private_test_cases.encode('utf-8'))))
+        )

     # load metadata
     metadata = json.loads(item['metadata'])
@@ -47,25 +47,17 @@ def transform(item):
     return item


-def filter_date(dataset, start_date=None, end_date=None):
-    new_dataset = []
-
-    for item in dataset:
-        contest_date = datetime.fromisoformat(item['contest_date'])
-        if start_date is not None:
-            p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
-            if p_start_date > contest_date:
-                continue
+def filter_date(contest_date, start_date=None, end_date=None) -> bool:

-        if end_date is not None:
-            p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
-            if p_end_date < contest_date:
-                continue
+    contest_date = datetime.fromisoformat(contest_date)
+    if start_date is not None:
+        p_start_date = datetime.strptime(start_date, '%Y-%m-%d')
+        if p_start_date > contest_date:
+            return False

-        new_dataset.append(item)
+    if end_date is not None:
+        p_end_date = datetime.strptime(end_date, '%Y-%m-%d')
+        if p_end_date < contest_date:
+            return False

-    if start_date or end_date:
-        logger.info(
-            f'Filtered dataset with start_date: {start_date}, end_date: {end_date}, remaining items: {len(new_dataset)}'
-        )
-    return new_dataset
+    return True
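In the load_utils change above, filter_date becomes a per-sample predicate that the adapter's sample_filter hook (see the live_code_bench_adapter hunk) applies to each sample's contest_date, instead of a function that rebuilds the whole dataset list. A small usage sketch with hypothetical dates, assuming the signature shown in the hunk:

# Hypothetical dates; filter_date returns True when the contest date falls inside the window.
filter_date('2024-08-01T00:00:00', start_date='2024-05-01', end_date='2024-12-31')  # True
filter_date('2023-01-15T00:00:00', start_date='2024-05-01')  # False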
evalscope/benchmarks/live_code_bench/testing_util.py
@@ -4,18 +4,22 @@ import faulthandler
 import json
 import numpy as np
 import platform
+
 # to run the solution files we're using a timing based approach
 import signal
 import sys
 import time
+
 # used for debugging to time steps
 from datetime import datetime
 from decimal import Decimal
 from enum import Enum
 from functools import partial
 from io import StringIO
+
 # from pyext import RuntimeModule
 from types import ModuleType
+
 # used for testing the code that reads from input
 from unittest.mock import mock_open, patch

@@ -342,8 +346,8 @@ def grade_stdio(
             return all_results, WA_send_args

     for output_line_idx, (
-        stripped_prediction_line,
-        stripped_gt_out_line,
+            stripped_prediction_line,
+            stripped_gt_out_line,
     ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
         WA_send_args['error_message'] = (
             f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -1,82 +1,56 @@
 from typing import Any

-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-
-SUBSET_LIST = ['default']
-
-
-@Benchmark.register(
-    name='maritime_bench',
-    pretty_name='MaritimeBench',
-    tags=['Maritime', 'MCQ', 'Knowledge'],
-    description=
-    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
-    dataset_id='HiDolphin/MaritimeBench',
-    model_adapter=OutputType.GENERATION,
-    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-    subset_list=SUBSET_LIST,
-    metric_list=['AverageAccuracy'],
-    eval_split='test',
-    prompt_template=
-    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}',  # noqa: E501
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+MARITIME_PROMPT_TEMPLATE = '请回答单选题。要求只输出选项,不输出解释,将选项放在[]里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:[A]\n 当前题目\n {question}\n选项:\n{choices}'  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='maritime_bench',
+        pretty_name='MaritimeBench',
+        tags=[Tags.CHINESE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+        description=
+        'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.',  # noqa: E501
+        dataset_id='HiDolphin/MaritimeBench',
+        metric_list=['acc'],
+        few_shot_num=0,
+        eval_split='test',
+        prompt_template=MARITIME_PROMPT_TEMPLATE,
+    )
 )
-class MaritimeBenchAdapter(DataAdapter):
+class MaritimeBenchAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.choices = ['A', 'B', 'C', 'D']
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-
-        prefix = ''
-        query = prefix + input_d['question'] + '\n'
-        available_choices = []
-        for option in self.choices:
-            if option in input_d and input_d[option]:
-                query += option + ':' + input_d[option] + '\n'
-                available_choices.append(option)
-
-        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
-        return self.gen_prompt_data(full_prompt, choices=available_choices)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-
-        Args:
-            input_d: input raw data. Depending on the dataset.
-
-        Returns:
-            The parsed input. e.g. gold answer ... Depending on the dataset.
-        """
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the raw model prediction (pred).
-
-        Args:
-            pred: model prediction. Depending on the model.
-
-        Returns:
-            The parsed prediction. e.g. model answer... Depending on the model.
-        """
-
-        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
-
-    def match(self, gold: Any, pred: Any) -> Any:
-        """
-        Match the gold answer with the predicted answer.
-
-        Args:
-            gold: The gold answer.
-            pred: The predicted answer.
-
-        Returns:
-            The result of the match.
-        """
-        return exact_match(gold=gold, pred=pred)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record) -> Sample:
+        # Extract available choices from the record
+        choices = []
+        choice_letters = ['A', 'B', 'C', 'D']
+        for letter in choice_letters:
+            if letter in record and record[letter]:
+                choices.append(record[letter])
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+        )
+
+    def format_prompt_template(self, sample):
+        choices = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(sample.choices)])
+        return MARITIME_PROMPT_TEMPLATE.format(question=sample.input, choices=choices)
+
+    def extract_answer(self, prediction, task_state):
+        # use regex to extract the answer from the prediction
+        import re
+        match = re.search(r'\[([A-D])\]', prediction)
+        if match:
+            return match.group(1)
+        return ''
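The adapter hunks in this diff follow the same 1.0 migration pattern: the @Benchmark.register decorator with flat keyword arguments is replaced by register_benchmark(BenchmarkMeta(...)), and the gen_prompt / get_gold_answer / parse_pred_result / match methods collapse into record_to_sample plus optional hooks such as extract_answer and match_score. Below is a minimal sketch of a custom multiple-choice adapter under the new API, inferred only from the hunks shown here; the benchmark name and dataset id are hypothetical, and the exact BenchmarkMeta fields and adapter hooks should be checked against the evalscope 1.0 sources.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq_bench',          # hypothetical benchmark name
        pretty_name='MyMCQBench',
        tags=[Tags.KNOWLEDGE],
        description='Example multiple-choice benchmark registration.',
        dataset_id='org/my-dataset',  # hypothetical ModelScope dataset id
        metric_list=['acc'],
        few_shot_num=0,
        eval_split='test',
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw dataset record onto the framework's Sample structure.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],
        )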