evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/benchmark/adapters/image_edit_adapter.py
@@ -0,0 +1,82 @@
+import os
+from typing import Optional
+
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.io_utils import jsonl_to_list
+from .text2image_adapter import Text2ImageAdapter
+
+logger = get_logger()
+
+
+class ImageEditAdapter(Text2ImageAdapter):
+    """
+    Support two methods:
+    1. Inference using modelscope pipeline
+    2. Load local inference jsonl file with key to corresponding prompt
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.local_file = self.extra_params.get('local_file', None)
+        self.id_key = self.extra_params.get('id_key', FileConstants.ID)
+        self.image_key = self.extra_params.get('image_key', FileConstants.IMAGE_PATH)
+        self.local_data = self.load_local_file()
+
+    def load_local_file(self) -> Optional[dict]:
+        if not self.local_file:
+            return None
+
+        # Load file and check
+        data_list = jsonl_to_list(self.local_file)
+        data_dict = {}
+        for record in data_list:
+            if self.image_key not in record:
+                raise ValueError(f"Image key '{self.image_key}' not found in record: {record}, file {self.local_file}")
+            if self.id_key not in record:
+                raise ValueError(f"ID key '{self.id_key}' not found in record: {record}, file {self.local_file}")
+
+            image_path = record[self.image_key]
+            if not os.path.isabs(image_path):
+                image_path = os.path.join(os.path.dirname(self.local_file), image_path)
+            if not os.path.exists(image_path):
+                raise FileNotFoundError(f"Image file '{image_path}' not found.")
+
+            data_dict[record[self.id_key]] = record
+        return data_dict
+
+    def get_image_path_from_id(self, image_id) -> Optional[str]:
+        if not self.local_file:
+            return None
+
+        record = self.local_data.get(image_id)
+        if not record:
+            return None
+
+        return record[self.image_key]
+
+    def _post_process_samples(self):
+        super()._post_process_samples()
+
+        # Add local image path if exists
+        for subset in self.test_dataset.keys():
+            for sample in self.test_dataset[subset]:
+                local_image_path = self.get_image_path_from_id(sample.metadata.get(FileConstants.ID))
+                if local_image_path:
+                    sample.metadata[FileConstants.IMAGE_PATH] = local_image_path
+
+    def sample_filter(self, sample) -> bool:
+        """
+        Filter samples based on metadata availability.
+        If local file is not available, all samples are considered valid.
+        Otherwise, only samples with valid metadata and image path are kept.
+        """
+        if not self.local_data:
+            return True
+        else:
+            sample_id = sample.metadata.get(FileConstants.ID)
+            if (not sample_id) or (not self.get_image_path_from_id(sample_id)):
+                return False
+            return True
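For reference, the local-file mode above expects a JSONL file whose records carry both an ID key and an image-path key (by default FileConstants.ID and FileConstants.IMAGE_PATH), with relative paths resolved against the JSONL file's own directory. The sketch below shows how such a file might be produced and wired in via extra_params; the literal key names 'id' and 'image_path', the output paths, and the extra_params wiring are assumptions for illustration, not values taken from the diff.

import json
import os

# Hypothetical pre-generated results for ImageEditAdapter's local-file mode.
# Assumes FileConstants.ID == 'id' and FileConstants.IMAGE_PATH == 'image_path';
# set 'id_key' / 'image_key' in extra_params if your record keys differ.
records = [
    {'id': 'sample_0001', 'image_path': 'images/sample_0001.png'},
    {'id': 'sample_0002', 'image_path': 'images/sample_0002.png'},
]

out_dir = 'outputs/image_edit_run'
os.makedirs(os.path.join(out_dir, 'images'), exist_ok=True)
local_file = os.path.join(out_dir, 'results.jsonl')

with open(local_file, 'w', encoding='utf-8') as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

# The adapter would then be configured roughly like this (sketch only):
extra_params = {
    'local_file': local_file,   # skip pipeline inference, reuse these images
    'id_key': 'id',             # key holding the sample ID in each record
    'image_key': 'image_path',  # key holding the (relative or absolute) image path
}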
evalscope/api/benchmark/adapters/multi_choice_adapter.py
@@ -0,0 +1,83 @@
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.utils.multi_choices import (
+    FEW_SHOT_TEMPLATE,
+    MultipleChoiceTemplate,
+    format_example,
+    parse_answers,
+    parse_answers_zh,
+    prompt,
+    valid_template,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+
+class MultiChoiceAdapter(DefaultDataAdapter):
+    """
+    Adapter for multi-choice benchmarks.
+    This adapter formats the input for multi-choice questions and handles few-shot examples.
+    """
+
+    multiple_correct: bool = False
+    """Whether the benchmark allows multiple correct answers."""
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        """
+        Format the basic prompt template with the sample data.
+
+        Args:
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The formatted prompt ready for model input
+        """
+        assert valid_template(self.prompt_template), 'Prompt template is not valid'
+
+        return prompt(
+            question=sample.input,
+            choices=Choices(sample.choices),
+            template=self.prompt_template,
+        )
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        """
+        Format the few-shot template with demonstrations and the main prompt.
+
+        Args:
+            fewshot (str): The formatted few-shot demonstration examples
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The complete formatted input with few-shot context
+        """
+
+        few_shot_prompt_template = self.few_shot_prompt_template or (FEW_SHOT_TEMPLATE + self.prompt_template)
+
+        assert valid_template(few_shot_prompt_template), 'Few-shot prompt template is not valid'
+
+        return prompt(
+            question=sample.input, choices=Choices(sample.choices), template=few_shot_prompt_template, fewshot=fewshot
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Convert a sample to a few-shot formatted string.
+
+        Args:
+            sample (Sample): The sample object to format
+
+        Returns:
+            str: The formatted few-shot example string
+        """
+        return format_example(question=sample.input, choices=Choices(sample.choices), answer=Target(sample.target))
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        if self.prompt_template in [
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE
+        ]:
+            # For Chinese COT template, we use a different extraction method
+            answers = parse_answers_zh(task_state, multiple_correct=self.multiple_correct)
+        else:
+            answers = parse_answers(task_state, multiple_correct=self.multiple_correct)
+        return ''.join(sorted(list(answers)))
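As a rough illustration of the prompt flow above, the adapter fills a template with the question, the lettered choices, and optionally a block of few-shot demonstrations, then joins the sorted set of parsed answer letters. The sketch below is a standalone stand-in, not evalscope's actual prompt() / format_example() implementations; the {question}/{choices}/{fewshot} placeholder names and template wording are assumptions.

from string import ascii_uppercase

# Hypothetical stand-ins for evalscope.utils.multi_choices helpers (assumed placeholders).
SINGLE_ANSWER_TEMPLATE = (
    'Answer the following multiple choice question.\n\n'
    '{question}\n{choices}\n\nAnswer with the letter only.'
)
FEW_SHOT_TEMPLATE = '{fewshot}\n\n'


def render_choices(choices):
    # Label choices A), B), C), ... as a typical multiple-choice prompt would.
    return '\n'.join(f'{ascii_uppercase[i]}) {c}' for i, c in enumerate(choices))


def fill_prompt(question, choices, template, fewshot=''):
    return template.format(question=question, choices=render_choices(choices), fewshot=fewshot)


question = 'Which planet is known as the Red Planet?'
choices = ['Venus', 'Mars', 'Jupiter', 'Saturn']

print(fill_prompt(question, choices, FEW_SHOT_TEMPLATE + SINGLE_ANSWER_TEMPLATE,
                  fewshot='Q: 2 + 2?\nA) 3\nB) 4\nANSWER: B'))

# extract_answer() above concatenates the sorted parsed letters, e.g. {'C', 'A'} -> 'AC'
print(''.join(sorted({'C', 'A'})))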
evalscope/api/benchmark/adapters/text2image_adapter.py
@@ -0,0 +1,156 @@
+import base64
+import os
+
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import ContentImage
+from evalscope.api.metric import Score
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
+from evalscope.api.registry import get_metric
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class Text2ImageAdapter(DefaultDataAdapter):
+    """Text to Image Adapter for benchmarks."""
+
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'prompt': record['prompt'],
+                'category': record.get('category', ''),
+                'tags': record.get('tags', []),
+                FileConstants.ID: record[FileConstants.ID],
+                FileConstants.IMAGE_PATH: record.get(FileConstants.IMAGE_PATH,
+                                                     ''),  # Optional field for existing image path
+            }
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return ModelOutput(
+                model=model.name,
+                choices=[ChatCompletionChoice.from_content('')],
+            )
+        else:
+            # Execute model inference with the processed input and any tools
+            model_output = model.generate(input=sample.input, tools=sample.tools)
+            return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes. Save generated images to output_dir.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+        else:
+            image_id = f'{sample.metadata.get(FileConstants.ID, sample.id)}_{sample.group_id}'
+            output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
+            if not os.path.exists(os.path.dirname(output_path)):
+                os.makedirs(os.path.dirname(output_path))
+            # get base64 image from model_output
+            content = model_output.message.content[0]
+
+            assert isinstance(content, ContentImage), 'Expected ContentImage in model output'
+
+            image_base64 = content.image
+            with open(output_path, 'wb') as f:
+                f.write(base64.b64decode(image_base64))
+
+            sample.metadata[FileConstants.IMAGE_PATH] = output_path
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+
+    # NOTE: thread safe is needed, since we can't batch inference here.
+    @thread_safe
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get(FileConstants.IMAGE_PATH, original_prediction)
+        prompt = task_state.input[0].content
+        meta = task_state.metadata
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(image_path, prompt)[0]
+
+                # fine-granular metrics
+                category = meta.get('category')
+                if category:
+                    metric_name = f'{metric_name}_{category}'
+                if isinstance(metric_score, dict):
+                    for k, v in metric_score.items():
+                        score.value[f'{metric_name}_{k}'] = v.cpu().item()
+                else:
+                    score.value[metric_name] = metric_score.cpu().item()
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add the aggregation name for text-to-image reports
+        return super()._on_generate_report(scores, model_name, False)
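The match_score loop above accepts each entry of metric_list either as a plain registry name or as a single-key dict whose value is a kwargs mapping for the metric class. The sketch below shows both forms and how they would be parsed; the metric names and parameters shown are assumptions for illustration, not names guaranteed to exist in evalscope's metric registry.

# Hypothetical metric_list for a text-to-image benchmark. A plain string is looked up
# in the registry and instantiated with defaults; a dict passes constructor kwargs.
metric_list = [
    'pickscore',                                # assumed registry name, default args
    {'clipscore': {'model_name': 'ViT-L-14'}},  # assumed name plus assumed kwargs
]

for metric in metric_list:
    if isinstance(metric, str):
        metric_name, metric_kwargs = metric, {}
    elif isinstance(metric, dict):
        metric_name = list(metric.keys())[0]
        metric_kwargs = metric[metric_name]
    print(metric_name, metric_kwargs)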
evalscope/api/benchmark/adapters/vision_language_adapter.py
@@ -0,0 +1,6 @@
+from .default_data_adapter import DefaultDataAdapter
+
+
+class VisionLanguageAdapter(DefaultDataAdapter):
+    """Adapter for vision-language benchmarks, e.g., image captioning, visual question answering, etc."""
+    pass
evalscope/api/benchmark/benchmark.py
@@ -0,0 +1,356 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from evalscope.api.dataset import DatasetDict, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
+from evalscope.api.metric import AggScore, SampleScore
+from evalscope.api.mixin import LLMJudgeMixin
+from evalscope.api.model import Model
+from evalscope.report import Report
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class DataAdapter(LLMJudgeMixin, ABC):
+    """
+    Data Adapter for the benchmark.
+    """
+
+    def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskConfig'] = None):
+        self._benchmark_meta = benchmark_meta
+        self._task_config = task_config
+        super().__init__(task_config=task_config)
+
+        self.reformat_subset = False
+        """Whether to reformat the subset data with subset key"""
+
+        self.split_as_subset = False
+        """Whether to use the split name as the dataset subsets"""
+
+        self.shuffle_choices = False
+        """Whether to shuffle the choices in the dataset"""
+
+        self.save_metadata = True
+        """Whether to save metadata in the review result"""
+
+        self.category_map = {}
+        """Category map for the benchmark"""
+
+        self.current_subset_name = ''
+        """Subset name when loading datasets"""
+
+        # dataset
+        self.test_dataset: Optional[DatasetDict] = None
+        """Dataset to be evaluated"""
+
+        self.fewshot_dataset: Optional[DatasetDict] = None
+        """Dataset for few-shot evaluation"""
+
+        # filters
+        self._filter_ensemble: Optional[OrderedDict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the benchmark metadata to a dictionary."""
+        return self._benchmark_meta.to_string_dict()
+
+    @abstractmethod
+    def load_dataset(self) -> DatasetDict:
+        pass
+
+    @abstractmethod
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        pass
+
+    @abstractmethod
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        pass
+
+    @abstractmethod
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        pass
+
+    @abstractmethod
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a report based on the evaluation results.
+        """
+        pass
+
+    @property
+    def name(self) -> str:
+        """
+        Return the unique name of the benchmark.
+        """
+        return self._benchmark_meta.name
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Return the dataset ID or path to the benchmark.
+        """
+        return self._benchmark_meta.dataset_id
+
+    @property
+    def output_types(self) -> Optional[List[str]]:
+        """
+        Return the output types of the benchmark.
+        """
+        return self._benchmark_meta.output_types
+
+    @property
+    def limit(self) -> Optional[Union[int, float]]:
+        """
+        Return the limit for the benchmark.
+        """
+        return self._task_config.limit
+
+    @property
+    def repeats(self) -> int:
+        """
+        Return the number of repeats for each sample in the benchmark.
+        """
+        return self._task_config.repeats
+
+    @property
+    def dataset_hub(self) -> str:
+        """
+        Return the dataset hub type for the benchmark.
+        """
+        return self._task_config.dataset_hub
+
+    @dataset_hub.setter
+    def dataset_hub(self, value: str):
+        """
+        Set the dataset hub type for the benchmark.
+        """
+        self._task_config.dataset_hub = value
+
+    @property
+    def eval_type(self) -> str:
+        """
+        Return the evaluation type for the benchmark.
+        """
+        return self._task_config.eval_type
+
+    @property
+    def subset_list(self) -> List[str]:
+        """
+        Return the subset list of the benchmark.
+        """
+        return self._benchmark_meta.subset_list
+
+    @subset_list.setter
+    def subset_list(self, value: List[str]):
+        """
+        Set the subset list of the benchmark.
+        """
+        self._benchmark_meta.subset_list = value
+
+    @property
+    def metric_list(self) -> List[Union[str, Dict[str, Any]]]:
+        """
+        Return the metric list of the benchmark.
+        """
+        return self._benchmark_meta.metric_list
+
+    @property
+    def default_subset(self) -> str:
+        """
+        Return the default subset of the benchmark.
+        """
+        return self._benchmark_meta.default_subset
+
+    @default_subset.setter
+    def default_subset(self, value: str):
+        """
+        Set the default subset of the benchmark.
+        """
+        self._benchmark_meta.default_subset = value
+
+    @property
+    def few_shot_num(self) -> int:
+        """
+        Return the few shot number of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_num
+
+    @few_shot_num.setter
+    def few_shot_num(self, value: int):
+        """
+        Set the few shot number of the benchmark.
+        """
+        self._benchmark_meta.few_shot_num = value
+
+    @property
+    def few_shot_random(self) -> bool:
+        """
+        Return whether few shot is random for the benchmark.
+        """
+        return self._benchmark_meta.few_shot_random
+
+    @property
+    def train_split(self) -> Optional[str]:
+        """
+        Return the train split of the benchmark.
+        """
+        return self._benchmark_meta.train_split
+
+    @property
+    def eval_split(self) -> Optional[str]:
+        """
+        Return the eval split of the benchmark.
+        """
+        return self._benchmark_meta.eval_split
+
+    @property
+    def prompt_template(self) -> Optional[str]:
+        """
+        Return the prompt template of the benchmark.
+        """
+        return self._benchmark_meta.prompt_template
+
+    @prompt_template.setter
+    def prompt_template(self, value: str):
+        """
+        Set the prompt template of the benchmark.
+        """
+        self._benchmark_meta.prompt_template = value
+
+    @property
+    def system_prompt(self) -> Optional[str]:
+        """
+        Return the system prompt of the benchmark.
+        """
+        return self._benchmark_meta.system_prompt
+
+    @property
+    def query_template(self) -> Optional[str]:
+        """
+        Return the query template of the benchmark.
+        """
+        return self._benchmark_meta.query_template
+
+    @property
+    def few_shot_prompt_template(self) -> Optional[str]:
+        """
+        Return the few-shot prompt template of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_prompt_template
+
+    @property
+    def pretty_name(self) -> Optional[str]:
+        """
+        Return the pretty name of the benchmark.
+        """
+        return self._benchmark_meta.pretty_name
+
+    @property
+    def description(self) -> Optional[str]:
+        """
+        Return the description of the benchmark.
+        """
+        return self._benchmark_meta.description
+
+    @property
+    def tags(self) -> Optional[List[str]]:
+        """
+        Return the tags of the benchmark.
+        """
+        return self._benchmark_meta.tags
+
+    @property
+    def filters(self) -> Optional[OrderedDict]:
+        """
+        Return the filters of the benchmark.
+        """
+        return self._benchmark_meta.filters
+
+    @property
+    def filter_ensemble(self) -> Optional[FilterEnsemble]:
+        """
+        Return the filter ensemble of the benchmark.
+        """
+        if self._filter_ensemble is None:
+            if self.filters:
+                self._filter_ensemble = build_filter_ensemble(filters=self.filters)
+        return self._filter_ensemble
+
+    @property
+    def aggregation(self) -> str:
+        """
+        Return the aggregation function for the metrics.
+        """
+        return self._benchmark_meta.aggregation
+
+    @property
+    def extra_params(self) -> Optional[Dict]:
+        """
+        Return the extra parameters of the benchmark.
+        """
+        return self._benchmark_meta.extra_params
+
+    @property
+    def seed(self) -> Optional[int]:
+        """
+        Return the seed for the benchmark.
+        """
+        return self._task_config.seed
+
+    @property
+    def shuffle(self) -> bool:
+        """
+        Return whether to shuffle the dataset before evaluation.
+        """
+        return self._benchmark_meta.shuffle
+
+    @shuffle.setter
+    def shuffle(self, value: bool):
+        """
+        Set whether to shuffle the dataset before evaluation.
+        """
+        self._benchmark_meta.shuffle = value
+
+    @property
+    def shuffle_choices(self) -> bool:
+        """
+        Return whether to shuffle the choices in multiple-choice datasets.
+        """
+        return self._benchmark_meta.shuffle_choices
+
+    @shuffle_choices.setter
+    def shuffle_choices(self, value: bool):
+        """
+        Set whether to shuffle the choices in multiple-choice datasets.
+        """
+        self._benchmark_meta.shuffle_choices = value
+
+    @contextlib.contextmanager
+    def _temporary_attribute(self, attr_name: str, new_value):
+        """
+        Set a temporary value for an attribute and restore the original value after the context block.
+
+        Args:
+            attr_name: The name of the attribute to temporarily set.
+            new_value: The new value to set for the attribute.
+        """
+        had_attr = hasattr(self, attr_name)
+        original_value = getattr(self, attr_name, None) if had_attr else None
+
+        setattr(self, attr_name, new_value)
+        try:
+            yield
+        finally:
+            if had_attr:
+                setattr(self, attr_name, original_value)
+            else:
+                delattr(self, attr_name)
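The _temporary_attribute helper at the end of DataAdapter is a small context manager for overriding an attribute only for the duration of a with-block and restoring (or deleting) it afterwards. Below is a standalone sketch of the same pattern on a hypothetical class, kept independent of evalscope so it runs on its own; the class and attribute names are illustrative only.

import contextlib


class Example:
    def __init__(self):
        self.prompt_template = 'default template'

    @contextlib.contextmanager
    def _temporary_attribute(self, attr_name, new_value):
        # Same pattern as DataAdapter._temporary_attribute: stash, override, restore.
        had_attr = hasattr(self, attr_name)
        original_value = getattr(self, attr_name, None) if had_attr else None
        setattr(self, attr_name, new_value)
        try:
            yield
        finally:
            if had_attr:
                setattr(self, attr_name, original_value)
            else:
                delattr(self, attr_name)


adapter = Example()
with adapter._temporary_attribute('prompt_template', 'override for one subset'):
    print(adapter.prompt_template)  # -> 'override for one subset'
print(adapter.prompt_template)      # -> 'default template'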