evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/hle/hle_adapter.py
@@ -1,9 +1,13 @@
 import re
-from collections import defaultdict
-from typing import Any, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -21,98 +25,128 @@ SUBSET_LIST = [
     'Other',
 ]

+ANSWER_TYPE_EXACT_MATCH = 'exactMatch'
+ANSWER_TYPE_MULTIPLE_CHOICE = 'multipleChoice'
+
+# System prompt constants
+SYSTEM_EXACT_ANSWER = 'Your response should be in the following format:\nExplanation: {your explanation for your final answer}\nExact Answer: {your succinct, final answer}\nConfidence: {your confidence score between 0% and 100% for your answer}'
+
+SYSTEM_MC = 'Your response should be in the following format:\nExplanation: {your explanation for your answer choice}\nAnswer: {your chosen answer}\nConfidence: {your confidence score between 0% and 100% for your answer}'
+
+JUDGE_PROMPT = """Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {response}
+
+[correct_answer]: {correct_answer}

-@Benchmark.register(
-    name='hle',
-    pretty_name="Humanity's-Last-Exam",
-    tags=['Knowledge', 'QA'],
-    description=
-    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.', # noqa: E501
-    dataset_id='cais/hle',
-    subset_list=SUBSET_LIST,
-    metric_list=['AverageAccuracy'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+Your judgment must focus only on if there are meaningful differences between [correct_answer] and the [response]. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match. Explain why the [response] is correct or incorrect based on [correct_answer] in one or two sentences. Finally, write your answer in the format 'GRADE: C' for correct answer or 'GRADE: I' for incorrect answer.
+"""
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='hle',
+        pretty_name="Humanity's-Last-Exam",
+        tags=[Tags.KNOWLEDGE, Tags.QA],
+        description='Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 '
+        'questions across a broad range of subjects. It was created jointly by the Center '
+        'for AI Safety and Scale AI. The benchmark classifies the questions into the '
+        'following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), '
+        'humanities/social science (9%), computer science/artificial intelligence (10%), '
+        'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
+        'require the ability to understand both text and images, i.e., multi-modality. '
+        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
+        'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.', # noqa: E501
+        dataset_id='cais/hle',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='{question}',
+        extra_params={'include_multi_modal': True}
+    )
 )
-class HLEAdapter(DataAdapter):
+class HLEAdapter(DefaultDataAdapter):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        self.llm_as_a_judge = True
-
-    def load(self, **kwargs):
-        kwargs['subset_list'] = ['default']
-        data_dict = super().load(**kwargs)
-        return self.reformat_subset(data_dict, subset_key='category', format='{}')
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        # remove image preview
-        input_d.pop('image_preview', None)
-        input_d.pop('rationale_image', None)
-        # generate prompt
-        question = input_d['question']
-        prompt = self.prompt_template.format(query=question)
-        image = input_d.get('image', None)
-        # build messages for multi-modal input
-        messages = []
-        if self.system_prompt:
-            messages.append({'role': 'system', 'content': self.system_prompt})
-        if image:
-            messages.append({
-                'role':
-                'user',
-                'content': [{
-                    'type': 'text',
-                    'text': prompt
-                }, {
-                    'type': 'image_url',
-                    'image_url': {
-                        'url': image
-                    }
-                }]
-            })
-        else:
-            messages.append({'role': 'user', 'content': prompt})
-        return self.gen_prompt_data(prompt='', messages=messages)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d['answer']
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
-        # Extract the answer from the model output \boxed{answer}
-        match = re.search(r'\\boxed{([^}]*)}', result)
-        if match:
-            return match.group(1).strip()
-        else:
-            logger.warning(f'No answer found in the model output: {result}')
-            return ''
-
-    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
-        return result.strip()
-
-    def match(self, gold: str, pred: str) -> dict:
-        # simple match
-        return {
-            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
-        }
-
-    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
-        raw_input = kwargs.get('raw_input', None)
-        question = raw_input['question']
-        # get grading response
-        prompt = judge.build_prompt(pred, gold, question)
-        judge_response = judge(prompt)
-        score = judge.get_score(judge_response)
-        return {
-            'AverageAccuracy': score,
-            'response': judge_response,
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.reformat_subset = True
+        self.include_multi_modal = self.extra_params.get('include_multi_modal', True)
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        answer_type = record['answer_type']
+        system_prompt = (SYSTEM_EXACT_ANSWER if answer_type == ANSWER_TYPE_EXACT_MATCH else SYSTEM_MC)
+        text_content = ContentText(text=record['question'])
+
+        content: List[Content] = [text_content]
+        if record['image']:
+            image_content = ContentImage(image=record['image'])
+            content.append(image_content)
+
+        messages: List[ChatMessage] = [
+            ChatMessageSystem(content=system_prompt),
+            ChatMessageUser(content=content),
+        ]
+        return Sample(
+            input=messages,
+            subset_key=record['category'],
+            metadata={
+                'uid': record['id'],
+                'author_name': record['author_name'],
+                'rationale': record['rationale'],
+                'raw_subject': record['raw_subject'],
+                'category': record['category'],
+                'has_image': bool(record['image']),
+            },
+            target=record['answer'],
+        )
+
+    def sample_filter(self, sample):
+        if not self.include_multi_modal:
+            if sample.metadata is not None and sample.metadata['has_image']:
+                return False
+        return True
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        confidence = 100
+        if task_state.output and task_state.output.completion:
+            confidence_match = re.search(r'confidence:\s*(\d+)', task_state.output.completion, re.IGNORECASE)
+            if confidence_match:
+                confidence = int(confidence_match.group(1))
+
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.input_text, response=filtered_prediction, correct_answer=reference
+        )
+
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+
+        # Parse judge response to get accuracy score
+        accuracy_score = re.search(r'GRADE:\s*([CI])', judge_response, re.IGNORECASE)
+        if accuracy_score:
+            score.value = {
+                'acc': 1.0 if accuracy_score.group(1) == 'C' else 0.0,
+            }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+            'confidence': confidence,
         }
-
-    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
-        # zip dict answers
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
-
-        return super().compute_metric(res_dict, **kwargs)
+        score.main_score_name = 'acc'
+        return score
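Read together, the hle_adapter.py hunk above illustrates the 1.0 adapter contract: benchmarks are registered with register_benchmark(BenchmarkMeta(...)), subclass DefaultDataAdapter, convert raw records into Sample objects in record_to_sample, and return Score objects from match_score or llm_match_score. Below is a minimal sketch of that pattern for a hypothetical benchmark; the benchmark name, dataset id, and record fields are invented for illustration, and any BenchmarkMeta arguments beyond those visible in these hunks are assumptions rather than confirmed API.

# Minimal sketch of the 1.0 registration pattern, modeled on the hunks above.
# 'my_qa' and 'my_org/my_qa_dataset' are hypothetical; the record is assumed to
# carry 'question' and 'answer' fields.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.messages import ChatMessageUser
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',
        pretty_name='My-QA',
        tags=[Tags.QA],
        description='Toy exact-match QA benchmark used to illustrate the adapter API.',
        dataset_id='my_org/my_qa_dataset',
        subset_list=['default'],
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw record to the framework's Sample type.
        return Sample(
            input=[ChatMessageUser(content=record['question'])],
            target=record['answer'],
            metadata=record,
        )

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        # Rule-based exact-match scoring, mirroring the Score fields set by the migrated adapters.
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        score.value = {'acc': 1.0 if filtered_prediction.strip() == reference.strip() else 0.0}
        score.main_score_name = 'acc'
        return score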
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -1,7 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
 import re
-
-from evalscope.benchmarks import Benchmark, DataAdapter
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -10,28 +18,28 @@ logger = get_logger()
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


-@Benchmark.register(
-    name='humaneval',
-    pretty_name='HumanEval',
-    tags=['Coding'],
-    description=
-    'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
-    dataset_id='modelscope/humaneval',
-    subset_list=['openai_humaneval'],
-    metric_list=['Pass@1'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    prompt_template=
-    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}', # noqa: E501
-    extra_params={
-        'num_workers': 4,
-        'timeout': 4
-    },
+@register_benchmark(
+    BenchmarkMeta(
+        name='humaneval',
+        pretty_name='HumanEval',
+        tags=[Tags.CODING],
+        description=
+        'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+        dataset_id='opencompass/humaneval',
+        subset_list=['openai_humaneval'],
+        metric_list=['Pass@1'],
+        eval_split='test',
+        prompt_template=
+        'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{question}',
+        extra_params={
+            'num_workers': 4,
+            'timeout': 4
+        },
+    )
 )
-class HumanevalAdapter(DataAdapter):
+class HumanevalAdapter(DefaultDataAdapter):
     """
-    A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+    HumanEval adapter using the new data processing framework.
     """

     def __init__(self, **kwargs):
@@ -39,9 +47,11 @@ class HumanevalAdapter(DataAdapter):
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness
         except ImportError:
-            raise ImportError('Please install human_eval:'
-                              'https://github.com/openai/human-eval/tree/master#installation , '
-                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+            raise ImportError(
+                'Please install human_eval:'
+                'https://github.com/openai/human-eval/tree/master#installation , '
+                'Note that you need to enable the execution code in the human_eval/execution.py first.'
+            )
         super().__init__(**kwargs)

         extra_params = kwargs.get('extra_params', {})
@@ -53,41 +63,62 @@ class HumanevalAdapter(DataAdapter):
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness

-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
-            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate prompt for the model.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the Humaneval:
-            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
-        """
-        query = input_d['prompt']
-        full_prompt = self.prompt_template.format(query=query)
-
-        return self.gen_prompt_data(full_prompt)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        query = record['prompt']
+        full_prompt = self.prompt_template.format(question=query)
+
+        return Sample(
+            input=[ChatMessageUser(content=full_prompt)],
+            target=record['canonical_solution'],
+            metadata={
+                'task_id': record['task_id'],
+                'entry_point': record['entry_point'],
+                'prompt': record['prompt'],
+                'test': record['test'],
+            }
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        """Extract code from the prediction."""
+        return self._postprocess(prediction)

     @classmethod
     def _postprocess(cls, text: str) -> str:
+        """Extract code from markdown code blocks."""
         blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
         if len(blocks) >= 1:
             text = blocks[0]
         return text

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        return self._postprocess(result)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d
-
-    def match(self, gold: str, pred: str) -> float:
-        res = self.eval_func(gold, pred, self.timeout)
-        return float(res['passed'])
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Execute the code and check correctness
+        res = self.eval_func(task_state.metadata, filtered_prediction, self.timeout)
+        passed = res['passed']
+
+        score.value = {'pass': passed}
+        score.explanation = res.get('result', 'Code execution completed')
+        score.metadata = {'task_id': task_state.metadata['task_id'], 'timeout': self.timeout, 'execution_result': res}
+        score.main_score_name = 'pass'
+
+        return score
+
+    def aggregate_scores(self, sample_scores):
+        from evalscope.metrics.metric import PassAtK
+
+        # caculate pass@k here
+        agg_list = []
+        for metric in self.metric_list:
+            if metric.lower().startswith('pass@'):
+                k = int(metric.split('@')[1])
+                # Get the scores for this metric
+                agg = PassAtK(k)
+                agg_list.extend(agg(sample_scores))
+        return agg_list
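One detail worth noting in the humaneval_adapter.py hunk: answer extraction survives the migration unchanged. _postprocess still pulls the first fenced code block out of the model completion, and that is what match_score hands to the human-eval execution check. The standalone snippet below replays the same regex outside the adapter (the helper name strip_code_fences is ours, not part of the package), which is handy for checking what actually gets executed.

# Standalone replay of the fence-stripping regex kept by HumanevalAdapter._postprocess.
import re


def strip_code_fences(text: str) -> str:
    # Same pattern as in the adapter above; fall back to the raw text when no fence is found.
    blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
    return blocks[0] if blocks else text


completion = '''Here is the function:
```python
def has_close_elements(numbers, threshold):
    return any(abs(a - b) < threshold for i, a in enumerate(numbers) for b in numbers[i + 1:])
```'''

print(strip_code_fences(completion))  # prints only the code between the fences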
evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -1,54 +1,83 @@
-from collections import defaultdict
 from typing import Any, Dict, List

-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
-from evalscope.metrics import Metric, mean, metric_registry
-
-
-@Benchmark.register(
-    name='ifeval',
-    pretty_name='IFEval',
-    tags=['Instruction-Following'],
-    description=
-    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
-    dataset_id='opencompass/ifeval',
-    subset_list=['default'],
-    metric_list=[
-        'prompt_level_strict_acc',
-        'inst_level_strict_acc',
-        'prompt_level_loose_acc',
-        'inst_level_loose_acc',
-    ],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='train',
-    prompt_template='',
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ifeval',
+        pretty_name='IFEval',
+        description=
+        'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING],
+        dataset_id='opencompass/ifeval',
+        subset_list=['default'],
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        prompt_template='',
+    )
 )
-class IFEvalAdapter(DataAdapter):
+class IFEvalAdapter(DefaultDataAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        # register metrics
-        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
-        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
-        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
-        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return self.gen_prompt_data(input_d['prompt'])
+        Args:
+            record (Dict[str, Any]): Input data record.

-    def get_gold_answer(self, input_d: dict) -> str:
-        return input_d
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        prompt = record.get('prompt', '')
+        message_list = [ChatMessageUser(content=prompt)]

-    def match(self, gold: Any, pred: Any) -> Dict:
+        return Sample(input=message_list, target='', metadata=record)
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
         from evalscope.benchmarks.ifeval.utils import process_results

-        return process_results(gold, [pred])
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+        try:
+            # Process results using the existing ifeval utility
+            results = process_results(doc, [filtered_prediction])
+            score.value.update(results)
+
+            # Set main score name
+            score.main_score_name = 'prompt_level_strict'

-    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
-        # aggregate review results
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}

-        return super().compute_metric(res_dict, **kwargs)
+        return score
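Finally, a hedged sketch of how one of these migrated benchmarks might be invoked end to end. It assumes that the TaskConfig and run_task entry points (evalscope/config.py and evalscope/run.py in the file list above) keep their pre-1.0 names and that 'ifeval' resolves through the new register_benchmark registry; the model id is a placeholder, and TaskConfig fields other than model, datasets, and limit are not shown because this diff does not confirm them.

# Smoke-test sketch; entry-point and field names are assumptions, see the note above.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['ifeval'],                 # benchmark name registered by the adapter above
    limit=5,                             # only a handful of samples for a quick check
)
run_task(task_cfg=task_cfg)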