evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
--- a/evalscope/benchmarks/data_adapter.py
+++ /dev/null
@@ -1,528 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- import random
- from abc import ABC, abstractmethod
- from collections import defaultdict
- from typing import Any, Dict, List, Optional, Union
-
- from evalscope.benchmarks.utils import PromptData, load_file_with_extension, preprocess_decorator
- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
- from evalscope.metrics import LLMJudge, metric_registry
- from evalscope.report import Report, ReportGenerator
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class DataAdapter(ABC):
-     """
-     Data Adapter for the benchmark. You need to implement the following methods:
-         - gen_prompt
-         - get_gold_answer
-         - parse_pred_result
-         - match
-     """
-
-     def __init__(self,
-                  name: str,
-                  dataset_id: str,
-                  model_adapter: str,
-                  subset_list: list,
-                  metric_list: List[str],
-                  llm_as_a_judge: bool = False,
-                  output_types: Optional[List[str]] = None,
-                  few_shot_num: Optional[int] = 0,
-                  train_split: Optional[str] = None,
-                  eval_split: Optional[str] = None,
-                  prompt_template: Optional[str] = None,
-                  system_prompt: Optional[str] = None,
-                  query_template: Optional[str] = None,
-                  pretty_name: Optional[str] = None,
-                  description: Optional[str] = None,
-                  tags: Optional[List[str]] = None,
-                  **kwargs):
-         """
-         Args:
-             name: str, the name of the benchmark.
-             dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
-             model_adapter: str, the model adapter to use for the benchmark.
-             subset_list: list of subset names for the dataset.
-             metric_list: list, the metric list to evaluate the model on specific benchmark.
-             llm_as_a_judge: bool, whether to use LLM as a judge to evaluate the predicted answer against the gold answer.
-             output_types: list, the output types of the model adapter. Default: [model_adapter]
-             few_shot_num: int, number of few-shot examples. Default: 0
-             train_split: str, usually for few-shot examples. e.g. 'train'
-             eval_split: str, the target eval split name. e.g. 'test'
-             prompt_template: str, the prompt template for the benchmark,
-                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
-                 the form of A or B or C or D, do not output explanation:`
-             system_prompt: str, the system prompt for the benchmark, e.g. 'You are a helpful assistant.'
-             query_template: str, the query template for the benchmark, e.g. 'Please answer the following question: {}'
-             pretty_name: str, the pretty name of the benchmark, e.g. 'ARC Challenge Set'.
-             description: str, the description of the benchmark,
-                 e.g. 'ARC Challenge Set is a benchmark for evaluating reasoning abilities of models on science questions.'
-         """ # noqa: E501
-         self.name = name
-         self.dataset_id = dataset_id
-         self.model_adapter = model_adapter
-         self.subset_list = subset_list
-         self.metric_list = metric_list
-         self.llm_as_a_judge = llm_as_a_judge
-         self.output_types = output_types or [model_adapter]
-         self.few_shot_num = few_shot_num
-         self.train_split = train_split
-         self.eval_split = eval_split
-         self.prompt_template = prompt_template
-         self.system_prompt = system_prompt
-         self.query_template = query_template
-         self.pretty_name = pretty_name
-         self.description = description
-         self.tags = tags or []
-         self.config_kwargs = kwargs
-         self.category_map = kwargs.get('category_map', {})
-         self.choices = kwargs.get('choices', None)
-
-     def __init_subclass__(cls, **kwargs):
-         super().__init_subclass__(**kwargs)
-
-         # find and decorate parse_pred_result method
-         if hasattr(cls, 'parse_pred_result'):
-             original_method = cls.parse_pred_result
-             cls.parse_pred_result = preprocess_decorator(original_method)
-
-     def load(self,
-              dataset_name_or_path: str = None,
-              subset_list: list = None,
-              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-              **kwargs) -> dict:
-         """
-         Load the dataset. Remote and local datasets are supported.
-         You can rewrite this method to support your own local dataset, just follow the format of the output.
-
-         Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
-             train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.
-
-         """
-         dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
-         subset_list = subset_list or self.subset_list
-
-         # Try to load dataset from local disk
-         if os.path.exists(dataset_name_or_path):
-             logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
-             trust_remote_code = kwargs.pop('trust_remote_code', False)
-             data_dict = self.load_from_disk(
-                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
-         else:
-             logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
-             trust_remote_code = kwargs.pop('trust_remote_code', True)
-             data_dict = self.load_from_hub(
-                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
-         if len(data_dict) == 0:
-             raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
-         return data_dict
-
-     def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
-         from modelscope.msdatasets import MsDataset
-
-         datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
-         split_as_subset: bool = kwargs.pop('split_as_subset', False)
-         # Load dataset from remote
-         logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
-
-         data_dict = {}
-         split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-         if len(split_list) == 0:
-             logger.error(f'Got empty split list: {split_list}')
-
-         if split_as_subset:
-             for sub_name in subset_list:
-                 data_dict[sub_name] = {}
-                 # e.g. train: few-shot, test: target dataset to evaluate
-                 for split in split_list:
-                     dataset = MsDataset.load(
-                         dataset_name=dataset_name_or_path,
-                         split=sub_name,  # load subset from split
-                         cache_dir=work_dir,
-                         hub=datasets_hub,
-                         **kwargs)
-                     data_dict[sub_name].update({split: dataset})
-         else:
-             for sub_name in subset_list:
-                 data_dict[sub_name] = {}
-                 # e.g. train: few-shot, test: target dataset to evaluate
-                 for split in split_list:
-                     dataset = MsDataset.load(
-                         dataset_name=dataset_name_or_path,
-                         subset_name=sub_name,
-                         split=split,
-                         cache_dir=work_dir,
-                         hub=datasets_hub,
-                         **kwargs)
-                     data_dict[sub_name].update({split: dataset})
-
-         return data_dict
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         """
-         Load the dataset from local disk.
-         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
-         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
-         """
-         # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
-         dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
-         if os.path.exists(dataset_infos_path):
-             logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
-             os.remove(dataset_infos_path)
-         return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)
-
-     def load_with_snapshot(self,
-                            file_structure: Dict[str, List[str]],
-                            dataset_name_or_path: str = None,
-                            subset_list: list = None,
-                            work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                            **kwargs) -> dict:
-         """
-         For datasets that cannot be correctly loaded using MsDataset, utilize snapshot downloading to load the data.
-         This feature supports both remote and local datasets.
-
-         Args:
-             file_structure: dict, the file structure of the dataset, e.g. {'subset_name': ['file1.jsonl', 'file2.jsonl']}.
-             dataset_name_or_path: str, the dataset id on ModelScope or local path for the benchmark.
-             subset_list: list of subset names for the dataset.
-             work_dir: str, the working directory to store the dataset.
-         Returns: {'subset_name': {'eval': eval_dataset}}
-         """ # noqa: E501
-         dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
-         subset_list = subset_list or self.subset_list
-
-         # Try to load dataset from local disk
-         if os.path.exists(dataset_name_or_path):
-             logger.info(f'Loading dataset from {dataset_name_or_path}')
-             dataset_path = dataset_name_or_path
-         else:
-             from modelscope import dataset_snapshot_download
-
-             # Load dataset from remote
-             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
-             # flatten file structure
-             file_names = [file for sub_files in file_structure.values() for file in sub_files]
-             # download dataset snapshot
-             dataset_path = dataset_snapshot_download(
-                 dataset_name_or_path, cache_dir=work_dir, allow_file_pattern=file_names)
-         # read and process files
-         data_dict = defaultdict(dict)
-         for sub_name in subset_list:
-             file_paths = [os.path.join(dataset_path, file_name) for file_name in file_structure[sub_name]]
-             # not train split, only eval split
-             data_dict[sub_name][self.eval_split] = load_file_with_extension(file_paths)
-
-         return data_dict
-
-     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
-         """
-         Reformat the dataset subset with subset_key and format.
-         """
-         res_dict: dict = defaultdict(lambda: defaultdict(list), {key: defaultdict(list) for key in self.subset_list})
-
-         for sub_name, sub_data_dict in data_dict.items():
-             for split in [self.train_split, self.eval_split]:
-                 if split is None:
-                     continue
-                 for sample_d in sub_data_dict[split]:
-                     new_subset_name = format.format(sample_d[subset_key])
-                     if new_subset_name not in self.subset_list:
-                         continue
-                     res_dict[new_subset_name][split].append(sample_d)
-         return res_dict
-
-     def gen_prompts(self, data_dict: dict) -> dict:
-         """
-         Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-         Args:
-             data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
-
-         Returns:
-             {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-             prompt_d_i (dict): refer to the output of gen_prompt method.
-
-         e.g. train -- few-shot data, test -- target dataset to evaluate.
-         """
-         res_dict: dict = {}
-
-         if self.few_shot_num and self.few_shot_num < 0:
-             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
-
-         logger.info(f'Use settings: '
-                     f'> few_shot_num: {self.few_shot_num}, '
-                     f'> few_shot_split: {self.train_split}, '
-                     f'> target_eval_split: {self.eval_split}')
-
-         for sub_name, sub_data_dict in data_dict.items():
-             few_shot_data = []
-             if self.train_split and self.few_shot_num and self.few_shot_num > 0:
-                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
-                 few_shot_data = self.get_fewshot_examples([item for item in sub_data_dict[self.train_split]],
-                                                           self.few_shot_num,
-                                                           few_shot_random=few_shot_random)
-
-             res_dict[sub_name] = []
-             for sample_d in sub_data_dict[self.eval_split]:
-                 prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
-                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d
-                 res_dict[sub_name].append(prompt_d)
-
-         return res_dict
-
-     def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-         if k > len(data_list):
-             k = len(data_list)
-         if few_shot_random:
-             return random.sample(data_list, k)
-         else:
-             return data_list[:k]
-
-     def compute_metric(self, review_res_list: Union[dict, list], **kwargs) -> List[dict]:
-         """
-         Compute evaluation result by specific metrics.
-
-         Args:
-             review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-         Returns:
-             Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
-         """
-         if len(self.metric_list) == 0:
-             raise ValueError('No metric list found for the benchmark.')
-
-         res_list = []
-         for metric_str in self.metric_list:
-             metric = metric_registry.get(metric_str)
-             metric_name = metric.name
-             metric_func = metric.object
-             if isinstance(review_res_list, dict):
-                 review_res = review_res_list.get(metric_name, [])
-             else:
-                 review_res = review_res_list
-             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
-         return res_list
-
-     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
-                             **kwargs) -> Dict[str, List[float]]:
-         """
-         compute weighted mean of score of all samples
-
-         Args:
-             review_res_list: [score1, score2, ...]
-
-         Returns:
-             avg_res: Dict[str, List[float]]
-
-         """
-         if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
-             review_res_list = [item for sublist in review_res_list for item in sublist]
-
-         items = defaultdict(list)
-         for scores in review_res_list:
-             if isinstance(scores, dict):
-                 for k, v in scores.items():
-                     items[k].append(v)
-             else:
-                 items['AverageAccuracy'].append(scores)
-         return items
-
-     def gen_report(self, subset_score_map: dict, model_name: str, **kwargs) -> Report:
-         """
-         Generate report for the evaluation results for all subsets.
-
-         Args:
-             subset_score_map: The subset-score map.
-                 e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
-
-             model_name: The evaluation model name.
-
-         Returns: The evaluation report.
-
-         Here is a format example for gsm8k:
-         {
-             "name": "qwen2.5_gsm8k",
-             "metrics": [
-                 {
-                     "name": "AverageAccuracy",
-                     "categories": [
-                         {
-                             "name": "default",
-                             "subsets": [
-                                 {
-                                     "name": "main",
-                                     "score": 0.0,
-                                     "num": 2
-                                 }
-                             ],
-                             "num": 2,
-                             "score": 0.0,
-                             "macro_score": 0.0
-                         }
-                     ],
-                     "num": 2,
-                     "score": 0.0,
-                     "macro_score": 0.0
-                 }
-             ],
-             "dataset_name": "gsm8k",
-             "model_name": "qwen2.5"
-         }
-         """ # noqa: E501
-         return ReportGenerator.gen_report(subset_score_map, model_name, data_adapter=self, **kwargs)
-
-     def post_process_report(self, report: Report, **kwargs):
-         """
-         Post-process the report after generation. Draw a chart, save to file, etc.
-         This method can be overridden to customize the report format or content.
-
-         Args:
-             report (Report): The generated report.
-         """
-         pass
-
-     def gen_prompt_data(self,
-                         prompt: str = '',
-                         system_prompt: Optional[str] = None,
-                         choices: Optional[List[str]] = None,
-                         index: Optional[Union[int, str]] = None,
-                         id: Optional[Union[int, str]] = None,
-                         messages: Optional[List[dict]] = None,
-                         **kwargs) -> dict:
-         """
-         Generates a dictionary representation of prompt data for evaluation or inference.
-
-         Args:
-             prompt (str): The main prompt or input text. Can also be a list of prompts.
-             system_prompt (Optional[str], optional): An optional system-level prompt to provide context or instructions. Defaults to None.
-             choices (Optional[List[str]], optional): A list of possible choices for multi-choice tasks.
-                 If not provided, uses self.choices. Defaults to None.
-             index (Optional[Union[int, str]], optional): An optional index or identifier for the prompt.
-                 Defaults to 0 if not provided. Defaults to None.
-             id (Optional[Union[int, str]], optional): An optional unique identifier for the prompt data. Defaults to None.
-             messages (Optional[List[dict]], optional): An optional list of message dictionaries, typically for chat-based prompts. Defaults to None.
-                 If messages is provided, it will be used as the prompt data instead of the prompt string.
-
-         Returns:
-             dict: A dictionary representation of the prompt data, suitable for further processing or model input.
-         """ # noqa: E501
-         data = [prompt] if not isinstance(prompt, list) else prompt
-         prompt_data = PromptData(
-             data=data,
-             multi_choices=choices or self.choices,
-             system_prompt=system_prompt or self.system_prompt,
-             index=index or 0,
-             id=id,
-             messages=messages,
-             extra_data=kwargs.get('extra_data', None))
-         return prompt_data.to_dict()
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-         """
-         Generate model prompt from raw input, unify the prompt format for different datasets.
-         The input format is compatible with OpenAI Chat Completions APIs.
-
-         Args:
-             input_d (Any): The raw input. Depending on the dataset.
-             subset_name (str): The subset name.
-             few_shot_list (list): The few-shot examples.
-
-         Returns:
-             For class ChatGenerationModelAdapter, the output format is:
-                 {'data': [full_prompt], 'system_prompt': (str, optional)}, -- full_prompt: str, the constructed prompt for each sample from dataset.
-             For class MultiChoiceModelAdapter, the output format is:
-                 {'data': [full_prompt], 'multi_choices': self.choices} -- full_prompt: str, the constructed prompt for each sample from dataset.
-             For class ContinuationEvalModelAdapter, the output format is:
-                 {'data': ctx_continuation_pair_list, 'multi_choices': self.choices} -- ctx_continuation_pair_list: list, the context-continuation pair list.
-         """ # noqa: E501
-         raise NotImplementedError
-
-     @abstractmethod
-     def get_gold_answer(self, input_d: Any) -> Any:
-         """
-         Parse the raw input labels (gold).
-
-         Args:
-             input_d: input raw data. Depending on the dataset.
-
-         Returns:
-             The parsed input. e.g. gold answer ... Depending on the dataset.
-         """
-         raise NotImplementedError
-
-     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
-         """
-         Parse the predicted result and extract proper answer.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         return result
-
-     def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
-         """
-         Parse the predicted result using LLM.
-
-         Args:
-             result (Any): The predicted answer from the model.
-             raw_input_d (dict): The raw input data.
-             eval_type (str): The evaluation type, default is 'checkpoint'.
-
-         Returns:
-             The parsed answer. Usually a string for chat.
-         """
-         return result
-
-     def match(self, gold: Any, pred: Any) -> Any:
-         """
-         Match the gold answer and the predicted answer.
-
-         Args:
-             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'A', extracted from get_gold_answer method.
-             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
-                 e.g. 'B', extracted from parse_pred_result method.
-
-         Returns:
-             The match result. Usually a score (float) for chat/multiple-choice-questions.
-         """
-         return 1.0 if gold == pred else 0.0
-
-     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
-         """
-         Use LLM as a judge to evaluate the predicted answer against the gold answer.
-
-         Args:
-             gold (Any): The golden answer.
-             pred (Any): The predicted answer.
-
-         Returns:
-             The match result as a float score between 0 and 1.
-         """
-         # Default judge handling
-         if judge is None:
-             logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
-             return 0
-
-         # Extract question from raw_input if available
-         raw_input = kwargs.get('raw_input', {})
-         question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
-         # Find the first non-empty question key in raw_input
-         question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
-
-         # Request judge and obtain score
-         prompt = judge.build_prompt(pred, gold, question)
-         judge_response = judge(prompt)
-         score = judge.get_score(judge_response)
-
-         return score
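
Note on migration: the removed DataAdapter base class above documents the extension points a 0.17.x benchmark integration had to implement (gen_prompt, get_gold_answer, parse_pred_result, match); in 1.0.x these responsibilities move to the new evalscope/api/benchmark adapters listed in the files above. The sketch below is a minimal, hypothetical subclass written against the old interface shown in this hunk, purely to illustrate that contract; the class name, dataset id, model adapter string, and field keys are placeholders, not values taken from the package.

# Hypothetical 0.17.x-style adapter; relies only on the base class shown in the hunk above.
from evalscope.benchmarks.data_adapter import DataAdapter  # module removed in 1.0.x


class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(
            name='my_qa',                      # placeholder benchmark name
            dataset_id='path/or/dataset_id',   # placeholder ModelScope id or local path
            model_adapter='chat_generation',   # placeholder model adapter name
            subset_list=['default'],
            metric_list=['AverageAccuracy'],
            eval_split='test',
            **kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Reuse the base helper to produce the unified prompt dict.
        return self.gen_prompt_data(prompt=input_d['question'])

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return 1.0 if gold == pred else 0.0
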
--- a/evalscope/benchmarks/filters.py
+++ /dev/null
@@ -1,59 +0,0 @@
- import re
- from typing import Any, Callable, Dict
-
-
- class Filter:
-     """
-     A base Filter class that implements the registry pattern
-     """
-     _registry: Dict[str, Callable[[str, Any], str]] = {}
-
-     @classmethod
-     def register(cls, name: str) -> Callable:
-         """
-         Decorator to register a new filter function
-         """
-
-         def decorator(func: Callable[[str, Any], str]) -> Callable[[str, Any], str]:
-             cls._registry[name] = func
-             return func
-
-         return decorator
-
-     @classmethod
-     def get_filter(cls, name: str) -> Callable:
-         """
-         Get a registered filter by name
-         """
-         return cls._registry.get(name)
-
-     @classmethod
-     def apply(cls, name: str, value: str, *args, **kwargs) -> str:
-         """
-         Apply a registered filter to a value
-         """
-         filter_func = cls.get_filter(name)
-         if filter_func is None:
-             raise ValueError(f'Filter {name} not found')
-         return filter_func(value, *args, **kwargs)
-
-
- @Filter.register('remove_until')
- def remove_until(value: str, marker: str) -> str:
-     """
-     Remove everything before the last occurrence of marker
-     """
-     if marker not in value:
-         return value
-     return value[value.rindex(marker) + len(marker):]
-
-
- @Filter.register('extract')
- def extract(value: str, pattern: str) -> str:
-     """
-     Extract content from string using regex pattern
-     """
-     match = re.search(pattern, value)
-     if match:
-         return match.group(0)
-     return ''
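
The removed filters.py above implemented a small decorator-based registry that 1.0.x supersedes with the evalscope/filters package (extraction.py and selection.py in the file list). For readers migrating custom filters, here is a short usage sketch of that old registry as shown above; the sample strings and the custom 'strip' filter are illustrative only, not part of the package.

# Illustrative use of the removed 0.17.x Filter registry.
from evalscope.benchmarks.filters import Filter  # module removed in 1.0.x

text = 'Reasoning steps... The answer is: 42'

# Built-in filters registered above via @Filter.register(...)
tail = Filter.apply('remove_until', text, 'The answer is:')  # ' 42'
digits = Filter.apply('extract', text, r'\d+')               # '42'

# Custom filters used the same decorator pattern.
@Filter.register('strip')
def strip_filter(value: str) -> str:
    return value.strip()

print(Filter.apply('strip', tail))  # prints '42'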