evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py (deleted)
@@ -1,58 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='genai_bench',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['GenAI-Bench-1600'],
-     metric_list=['VQAScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class GenAIBenchAdapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)
-
-     def get_gold_answer(self, input_d: dict) -> dict:
-         # return prompt and elements dict
-         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-     def match(self, gold: dict, pred: str) -> dict:
-         # dummy match for general t2i
-         # pred is the image path, gold is the prompt
-         res = {}
-         for metric_name, metric_func in self.metrics.items():
-             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-             res[metric_name] = score.cpu().item()
-
-             # fine-granular metrics
-             if gold['tags'].get('advanced'):
-                 res[f'{metric_name}_advanced'] = score.cpu().item()
-             else:
-                 res[f'{metric_name}_basic'] = score.cpu().item()
-
-         return res

evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py (deleted)
@@ -1,58 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='general_t2i',
-     dataset_id='general_t2i',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['default'],
-     metric_list=['PickScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class GeneralT2IAdapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-
-         super().__init__(**kwargs)
-
-     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-         dataset_name_or_path = dataset_name_or_path or self.dataset_id
-         subset_list = subset_list or self.subset_list
-
-         data_file_dict = defaultdict(str)
-         data_item_dict = defaultdict(list)
-
-         # get data file path and subset name
-         if os.path.isdir(dataset_name_or_path):
-             for subset_name in subset_list:
-                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-         elif os.path.isfile(dataset_name_or_path):
-             cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-             data_file_dict[cur_subset_name] = dataset_name_or_path
-         else:
-             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-         # load data from local disk
-         try:
-             for subset_name, file_path in data_file_dict.items():
-                 data_item_dict[subset_name] = jsonl_to_list(file_path)
-         except Exception as e:
-             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-         data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-         return data_dict

evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py (deleted)
@@ -1,57 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='hpdv2',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['HPDv2'],
-     metric_list=['HPSv2.1Score'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class HPDv2Adapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)
-
-     def get_gold_answer(self, input_d: dict) -> dict:
-         # return prompt and elements dict
-         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-     def match(self, gold: dict, pred: str) -> dict:
-         # dummy match for general t2i
-         # pred is the image path, gold is the prompt
-         res = {}
-         for metric_name, metric_func in self.metrics.items():
-             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-             res[metric_name] = score.cpu().item()
-
-             # fine-granular metrics
-             category = gold['tags'].get('category')
-             if category:
-                 res[f'{metric_name}_{category}'] = score.cpu().item()
-
-         return res

evalscope/benchmarks/aigc/t2i/tifa_adapter.py (deleted)
@@ -1,37 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='tifa160',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['TIFA-160'],
-     metric_list=['PickScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class TIFA_Adapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)

evalscope/benchmarks/arc/ai2_arc.py (deleted)
@@ -1,151 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) Allen Institute, and its affiliates.
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- """AI2 ARC (Abstraction and Reasoning Corpus) for General Artificial Intelligence Benchmark."""
- """AUTO GENERATED, DO NOT EDIT"""
-
- import datasets
- import json
- import os
-
- # flake8: noqa
-
- _CITATION = """\
- @article{allenai:arc,
- author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and
- Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
- title = {Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
- journal = {arXiv:1803.05457v1},
- year = {2018},
- }
- """
-
- _DESCRIPTION = """\
- A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in
- advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains
- only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also
- including a corpus of over 14 million science sentences relevant to the task,
- and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.
-
- ARC-Easy:
- train: 2251
- test: 2376
- validation: 570
-
- ARC-Challenge:
- train: 1119
- test: 1172
- validation: 299
- """
-
- _URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/arc/ARC-V1-Feb2018.zip'
-
- # tasks: ['ARC-Easy', 'ARC-Challenge']
-
-
- class Ai2ArcConfig(datasets.BuilderConfig):
-     """BuilderConfig for Ai2ARC."""
-
-     def __init__(self, **kwargs):
-         """BuilderConfig for Ai2Arc.
-
-         Args:
-             **kwargs: keyword arguments forwarded to super.
-         """
-         super(Ai2ArcConfig, self).__init__(version=datasets.Version('1.0.0', ''), **kwargs)
-
-
- class Ai2Arc(datasets.GeneratorBasedBuilder):
-     """
-     The AI2 Reasoning Challenge (ARC) dataset.
-     Subset: ARC-Easy, ARC-Challenge.
-     """
-
-     VERSION = datasets.Version('1.0.0')
-     BUILDER_CONFIGS = [
-         Ai2ArcConfig(
-             name='ARC-Challenge',
-             description="""\
- Challenge Set of 2590 “hard” questions (those that both a retrieval and a co-occurrence method fail to answer correctly)
- """,
-         ),
-         Ai2ArcConfig(
-             name='ARC-Easy',
-             description="""\
- Easy Set of 5197 questions
- """,
-         ),
-     ]
-
-     def _info(self):
-         return datasets.DatasetInfo(
-             # This is the description that will appear on the datasets page.
-             description=_DESCRIPTION,
-             # datasets.features.FeatureConnectors
-             features=datasets.Features({
-                 'id':
-                 datasets.Value('string'),
-                 'question':
-                 datasets.Value('string'),
-                 'choices':
-                 datasets.features.Sequence({
-                     'text': datasets.Value('string'),
-                     'label': datasets.Value('string')
-                 }),
-                 'answerKey':
-                 datasets.Value('string')
-                 # These are the features of your dataset like images, labels ...
-             }),
-             # If there's a common (input, target) tuple from the features,
-             # specify them here. They'll be used if as_supervised=True in
-             # builder.as_dataset.
-             supervised_keys=None,
-             # Homepage of the dataset for documentation
-             homepage='https://allenai.org/data/arc',
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         """Returns SplitGenerators."""
-         # dl_manager is a datasets.download.DownloadManager that can be used to
-         # download and extract URLs
-         dl_dir = dl_manager.download_and_extract(_URL)
-         data_dir = os.path.join(dl_dir, 'ARC-V1-Feb2018-2')
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TRAIN,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Train.jsonl')},
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split.TEST,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Test.jsonl')},
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split.VALIDATION,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Dev.jsonl')},
-             ),
-         ]
-
-     def _generate_examples(self, filepath):
-         """Yields examples."""
-         with open(filepath, encoding='utf-8') as f:
-             for row in f:
-                 data = json.loads(row)
-                 answerkey = data['answerKey']
-                 id_ = data['id']
-                 question = data['question']['stem']
-                 choices = data['question']['choices']
-                 text_choices = [choice['text'] for choice in choices]
-                 label_choices = [choice['label'] for choice in choices]
-                 yield id_, {
-                     'id': id_,
-                     'answerKey': answerkey,
-                     'question': question,
-                     'choices': {
-                         'text': text_choices,
-                         'label': label_choices
-                     },
-                 }
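
The deleted ai2_arc.py above is a standard Hugging Face datasets GeneratorBasedBuilder: it downloads the ARC archive and yields one record per question. A minimal sketch of how such a script-based builder is typically loaded (the path is illustrative, and depending on the installed datasets version script loading may require trust_remote_code=True or may no longer be supported):

import datasets

# Materialize the ARC-Challenge config defined by the deleted ai2_arc.py builder script.
arc = datasets.load_dataset(
    'evalscope/benchmarks/arc/ai2_arc.py',  # illustrative path to a local copy of the script
    'ARC-Challenge',
    split='test',
    trust_remote_code=True,
)
print(arc[0]['question'], arc[0]['choices']['label'])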

evalscope/benchmarks/benchmark.py (deleted)
@@ -1,81 +0,0 @@
- import copy
- from collections import OrderedDict
- from dataclasses import dataclass, field, fields
- from typing import TYPE_CHECKING, Dict, List, Optional
-
- from evalscope.constants import OutputType
-
- if TYPE_CHECKING:
-     from evalscope.benchmarks import DataAdapter
-
- BENCHMARK_MAPPINGS = {}
-
-
- @dataclass
- class BenchmarkMeta:
-     name: str
-     dataset_id: str
-     data_adapter: 'DataAdapter'
-     model_adapter: Optional[str] = OutputType.GENERATION
-     output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
-     subset_list: List[str] = field(default_factory=lambda: ['default'])
-     metric_list: List[str] = field(default_factory=list)
-     few_shot_num: int = 0
-     few_shot_random: bool = False
-     train_split: Optional[str] = None
-     eval_split: Optional[str] = None
-     prompt_template: Optional[str] = None
-     system_prompt: Optional[str] = None
-     query_template: Optional[str] = None
-     pretty_name: Optional[str] = None
-     description: Optional[str] = None
-     tags: Optional[List[str]] = field(default_factory=list)
-     filters: Optional[OrderedDict] = None
-     extra_params: Optional[Dict] = field(default_factory=dict)
-
-     def _update(self, args: dict):
-         if args.get('local_path'):
-             self.dataset_id = args['local_path']
-             del args['local_path']
-         self.__dict__.update(args)
-
-     def to_dict(self) -> dict:
-         return self.__dict__
-
-     def to_string_dict(self) -> dict:
-         cur_dict = copy.deepcopy(self.to_dict())
-         # cur_dict['data_adapter'] = self.data_adapter.__name__
-         del cur_dict['data_adapter']
-         return cur_dict
-
-     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
-         if config:
-             self._update(config)
-
-         data_adapter = self.data_adapter(**self.to_dict())
-         return data_adapter
-
-
- class Benchmark:
-
-     def __init__(self):
-         pass
-
-     @classmethod
-     def get(cls, name: str) -> 'BenchmarkMeta':
-         if name not in BENCHMARK_MAPPINGS:
-             raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
-         benchmark = BENCHMARK_MAPPINGS[name]
-         return benchmark
-
-     @classmethod
-     def register(cls, name: str, dataset_id: str, **kwargs):
-
-         def register_wrapper(data_adapter):
-             if name in BENCHMARK_MAPPINGS:
-                 raise Exception(f'Benchmark {name} already registered')
-             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
-                 name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
-             return data_adapter
-
-         return register_wrapper
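
The removed benchmark.py above implemented the old registration flow: a data adapter class decorated with @Benchmark.register(...) (as in the T2I adapters earlier in this diff) was stored in BENCHMARK_MAPPINGS as a BenchmarkMeta, and callers retrieved it by name to build the adapter. A minimal sketch of how that old API was consumed (the benchmark name 'gsm8k' is illustrative):

from evalscope.benchmarks import Benchmark

# Look up the registered BenchmarkMeta; raises if the name was never registered.
meta = Benchmark.get('gsm8k')

# Instantiate the stored adapter class with the metadata fields as keyword arguments.
adapter = meta.get_data_adapter(config={})
# The adapter then exposes the old DataAdapter interface (load, get_gold_answer, match, ...).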

evalscope/benchmarks/ceval/ceval_exam.py (deleted)
@@ -1,146 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License
- import datasets
- import os
- import pandas as pd
-
- # flake8: noqa
- """DO NOT EDIT unless you are contributing a new dataset."""
-
- _CITATION = """\
- @article{huang2023ceval,
- title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
- author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
- journal={arXiv preprint arXiv:2305.08322},
- year={2023}
- }
- """
-
- _DESCRIPTION = """\
- C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
- """
-
- _HOMEPAGE = 'https://cevalbenchmark.com'
-
- _LICENSE = 'Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License'
-
- _URL = r'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/c-eval/ceval-exam.zip'
-
- task_list = [
-     'computer_network',
-     'operating_system',
-     'computer_architecture',
-     'college_programming',
-     'college_physics',
-     'college_chemistry',
-     'advanced_mathematics',
-     'probability_and_statistics',
-     'discrete_mathematics',
-     'electrical_engineer',
-     'metrology_engineer',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_chemistry',
-     'high_school_biology',
-     'middle_school_mathematics',
-     'middle_school_biology',
-     'middle_school_physics',
-     'middle_school_chemistry',
-     'veterinary_medicine',
-     'college_economics',
-     'business_administration',
-     'marxism',
-     'mao_zedong_thought',
-     'education_science',
-     'teacher_qualification',
-     'high_school_politics',
-     'high_school_geography',
-     'middle_school_politics',
-     'middle_school_geography',
-     'modern_chinese_history',
-     'ideological_and_moral_cultivation',
-     'logic',
-     'law',
-     'chinese_language_and_literature',
-     'art_studies',
-     'professional_tour_guide',
-     'legal_professional',
-     'high_school_chinese',
-     'high_school_history',
-     'middle_school_history',
-     'civil_servant',
-     'sports_science',
-     'plant_protection',
-     'basic_medicine',
-     'clinical_medicine',
-     'urban_and_rural_planner',
-     'accountant',
-     'fire_engineer',
-     'environmental_impact_assessment_engineer',
-     'tax_accountant',
-     'physician',
- ]
-
-
- class CevalExamConfig(datasets.BuilderConfig):
-
-     def __init__(self, **kwargs):
-         super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
- class CevalExam(datasets.GeneratorBasedBuilder):
-     BUILDER_CONFIGS = [CevalExamConfig(name=task_name, ) for task_name in task_list]
-
-     def _info(self):
-         features = datasets.Features({
-             'id': datasets.Value('int32'),
-             'question': datasets.Value('string'),
-             'A': datasets.Value('string'),
-             'B': datasets.Value('string'),
-             'C': datasets.Value('string'),
-             'D': datasets.Value('string'),
-             'answer': datasets.Value('string'),
-             'explanation': datasets.Value('string'),
-         })
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=features,
-             homepage=_HOMEPAGE,
-             license=_LICENSE,
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         data_dir = dl_manager.download_and_extract(_URL)
-         task_name = self.config.name
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TEST,
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'test', f'{task_name}_test.csv'),
-                 },
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split('val'),
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'val', f'{task_name}_val.csv'),
-                 },
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split('dev'),
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'dev', f'{task_name}_dev.csv'),
-                 },
-             ),
-         ]
-
-     def _generate_examples(self, filepath):
-         df = pd.read_csv(filepath, encoding='utf-8')
-         for i, instance in enumerate(df.to_dict(orient='records')):
-             if 'answer' not in instance.keys():
-                 instance['answer'] = ''
-             if 'explanation' not in instance.keys():
-                 instance['explanation'] = ''
-             yield i, instance