evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/benchmark/__init__.py
@@ -1,3 +1,11 @@
- from .adapters import DefaultDataAdapter, MultiChoiceAdapter, Text2ImageAdapter
+ from .adapters import (
+     AgentAdapter,
+     DefaultDataAdapter,
+     ImageEditAdapter,
+     MultiChoiceAdapter,
+     NERAdapter,
+     Text2ImageAdapter,
+     VisionLanguageAdapter,
+ )
  from .benchmark import DataAdapter
  from .meta import BenchmarkMeta
evalscope/api/benchmark/adapters/__init__.py
@@ -1,3 +1,7 @@
+ from .agent_adapter import AgentAdapter
  from .default_data_adapter import DefaultDataAdapter
+ from .image_edit_adapter import ImageEditAdapter
  from .multi_choice_adapter import MultiChoiceAdapter
+ from .ner_adapter import NERAdapter
  from .text2image_adapter import Text2ImageAdapter
+ from .vision_language_adapter import VisionLanguageAdapter
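For orientation, a minimal sketch of what these re-exports enable downstream; the subclass name is hypothetical, and the import path simply follows the two __init__.py hunks above:

from evalscope.api.benchmark import AgentAdapter, ImageEditAdapter, NERAdapter, VisionLanguageAdapter


class MyAgentBenchmarkAdapter(AgentAdapter):  # hypothetical subclass, for illustration only
    def __init__(self, **kwargs):
        # Constructor kwargs are forwarded through AgentAdapter to DefaultDataAdapter.
        super().__init__(**kwargs)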
evalscope/api/benchmark/adapters/agent_adapter.py
@@ -0,0 +1,8 @@
+ from .default_data_adapter import DefaultDataAdapter
+
+
+ class AgentAdapter(DefaultDataAdapter):
+     """Adapter for agent benchmarks. e.g., function calling, etc."""
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
evalscope/api/benchmark/adapters/default_data_adapter.py
@@ -2,6 +2,7 @@ import os
  from collections import defaultdict
  from functools import partial
  from overrides import override
+ from tqdm.auto import tqdm
  from typing import Any, Callable, Dict, List, Optional, Tuple, Type

  from evalscope.api.dataset import DataLoader, Dataset, DatasetDict, LocalDataLoader, RemoteDataLoader, Sample
@@ -128,6 +129,9 @@ class DefaultDataAdapter(DataAdapter):
              for sample in self.test_dataset[subset]:
                  if isinstance(sample.input, str):
                      sample.input = self.process_sample_str_input(sample, subset)
+                 elif isinstance(sample.input, list):
+                     # Handle list[ChatMessage] and add system prompt if needed
+                     sample.input = self.process_sample_messages_input(sample, subset)

      def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
          """
@@ -142,6 +146,15 @@ class DefaultDataAdapter(DataAdapter):
              input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
          return input_messages

+     def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+         """
+         Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+         """
+         messages = list(sample.input)  # shallow copy to avoid in-place mutations
+         if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+             messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+         return messages
+
      def process_sample_input(self, sample: Sample, subset: str) -> str:
          """
          Process a single sample's input by applying prompt templates and few-shot formatting.
@@ -241,6 +254,7 @@ class DefaultDataAdapter(DataAdapter):
              filter_func=self.sample_filter,
              limit=self.limit if not self.reformat_subset else None,  # Limit number of samples if specified
              repeats=self.repeats,  # Number of repetitions for each sample
+             shuffle=self.shuffle,  # Shuffle dataset if enabled
              shuffle_choices=self.shuffle_choices,  # Shuffle choices if requested
              data_source=self.dataset_hub,  # Data source configuration
          )
@@ -599,6 +613,61 @@

          return sample_score

+     def batch_match_score(
+         self, original_predictions: List[str], filtered_predictions: List[str], references: List[str],
+         task_states: List[TaskState]
+     ) -> Optional[List[Score]]:
+         """
+         Batch calculate evaluation scores by comparing predictions with references.
+
+         This method computes scores using all configured metrics for a batch of samples
+         and creates a list of Score objects with detailed evaluation results.
+
+         Args:
+             original_predictions (List[str]): The original, unfiltered model predictions
+             filtered_predictions (List[str]): The filtered and processed predictions
+             references (List[str]): The ground truth reference answers
+             task_states (List[TaskState]): The complete task states for context
+
+         Returns:
+             List[Score]: List of objects containing all calculated metric scores and metadata
+         """
+         return None  # Default implementation does not support batch scoring
+
+     @override
+     def batch_calculate_metrics(self, task_states: List[TaskState],
+                                 sample_scores: List[SampleScore]) -> List[SampleScore]:
+         """Batch calculate metrics for a list of task states with tqdm progress and batch processing."""
+         total = len(task_states)
+         if total == 0:
+             return sample_scores
+
+         # Prepare lists for batch processing
+         original_predictions: List[str] = []
+         filtered_predictions: List[str] = []
+         references: List[str] = []
+
+         for ts in task_states:
+             pred = ts.output.completion
+             original_predictions.append(pred)
+             filtered_predictions.append(self.filter_prediction(pred, ts))
+             references.append(ts.target)
+
+         batch_scores = self.batch_match_score(
+             original_predictions=original_predictions,
+             filtered_predictions=filtered_predictions,
+             references=references,
+             task_states=task_states
+         )
+
+         if batch_scores is not None:
+             assert len(batch_scores) == len(sample_scores), \
+                 'Batch scores length must match sample scores length.'
+             for batch_score, sample_score in zip(batch_scores, sample_scores):
+                 sample_score.score.value.update(batch_score.value)
+
+         return sample_scores
+
      @override
      def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
          """
@@ -641,9 +710,7 @@ class DefaultDataAdapter(DataAdapter):
          """
          pass

-     def _on_generate_report(
-         self, scores: Dict[str, List[AggScore]], model_name: str, add_aggregation_name: bool = True
-     ) -> Report:
+     def _on_generate_report(self, scores: Dict[str, List[AggScore]], model_name: str) -> Report:
          """
          Hook method called during report generation.

@@ -659,7 +726,7 @@ class DefaultDataAdapter(DataAdapter):
              Report: The generated evaluation report
          """
          return ReportGenerator.generate_report(
-             score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=add_aggregation_name
+             score_dict=scores, model_name=model_name, data_adapter=self, add_aggregation_name=self.add_aggregation_name
          )

      @override
@@ -681,3 +748,7 @@ class DefaultDataAdapter(DataAdapter):
          report = self._on_generate_report(scores, model_name=model_name)
          self._on_generate_report_end(report, output_dir, **kwargs)
          return report
+
+     def finalize(self, *args, **kwargs):
+         # Finalize the evaluation process
+         self.sandbox_finalize(*args, **kwargs)
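To illustrate the new batch-scoring hook, here is a hedged sketch of a subclass that overrides batch_match_score. The class name and the exact-match logic are illustrative only, and the TaskState import path is an assumption rather than something shown in this diff; the Score construction mirrors the pattern used by the NER adapter below.

from typing import List, Optional

from evalscope.api.benchmark import DefaultDataAdapter
from evalscope.api.evaluator.state import TaskState  # assumed location of TaskState
from evalscope.api.metric import Score


class BatchExactMatchAdapter(DefaultDataAdapter):  # hypothetical subclass
    def batch_match_score(
        self, original_predictions: List[str], filtered_predictions: List[str], references: List[str],
        task_states: List[TaskState]
    ) -> Optional[List[Score]]:
        # Returning a list (instead of the default None) lets batch_calculate_metrics
        # merge each Score.value dict into the corresponding SampleScore.
        scores = []
        for pred, ref in zip(filtered_predictions, references):
            score = Score(extracted_prediction=pred, prediction=pred)
            score.value = {'exact_match': float(pred.strip() == ref.strip())}
            scores.append(score)
        return scores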
evalscope/api/benchmark/adapters/image_edit_adapter.py
@@ -0,0 +1,82 @@
+ import os
+ from typing import Optional
+
+ from evalscope.constants import EvalType, FileConstants
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+ from evalscope.utils.io_utils import jsonl_to_list
+ from .text2image_adapter import Text2ImageAdapter
+
+ logger = get_logger()
+
+
+ class ImageEditAdapter(Text2ImageAdapter):
+     """
+     Support two methods:
+     1. Inference using modelscope pipeline
+     2. Load local inference jsonl file with key to corresponding prompt
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.local_file = self.extra_params.get('local_file', None)
+         self.id_key = self.extra_params.get('id_key', FileConstants.ID)
+         self.image_key = self.extra_params.get('image_key', FileConstants.IMAGE_PATH)
+         self.local_data = self.load_local_file()
+
+     def load_local_file(self) -> Optional[dict]:
+         if not self.local_file:
+             return None
+
+         # Load file and check
+         data_list = jsonl_to_list(self.local_file)
+         data_dict = {}
+         for record in data_list:
+             if self.image_key not in record:
+                 raise ValueError(f"Image key '{self.image_key}' not found in record: {record}, file {self.local_file}")
+             if self.id_key not in record:
+                 raise ValueError(f"ID key '{self.id_key}' not found in record: {record}, file {self.local_file}")
+
+             image_path = record[self.image_key]
+             if not os.path.isabs(image_path):
+                 image_path = os.path.join(os.path.dirname(self.local_file), image_path)
+             if not os.path.exists(image_path):
+                 raise FileNotFoundError(f"Image file '{image_path}' not found.")
+
+             data_dict[record[self.id_key]] = record
+         return data_dict
+
+     def get_image_path_from_id(self, image_id) -> Optional[str]:
+         if not self.local_file:
+             return None
+
+         record = self.local_data.get(image_id)
+         if not record:
+             return None
+
+         return record[self.image_key]
+
+     def _post_process_samples(self):
+         super()._post_process_samples()
+
+         # Add local image path if exists
+         for subset in self.test_dataset.keys():
+             for sample in self.test_dataset[subset]:
+                 local_image_path = self.get_image_path_from_id(sample.metadata.get(FileConstants.ID))
+                 if local_image_path:
+                     sample.metadata[FileConstants.IMAGE_PATH] = local_image_path
+
+     def sample_filter(self, sample) -> bool:
+         """
+         Filter samples based on metadata availability.
+         If local file is not available, all samples are considered valid.
+         Otherwise, only samples with valid metadata and image path are kept.
+         """
+         if not self.local_data:
+             return True
+         else:
+             sample_id = sample.metadata.get(FileConstants.ID)
+             if (not sample_id) or (not self.get_image_path_from_id(sample_id)):
+                 return False
+             return True
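As a rough sketch of the local-file path this adapter supports: each JSONL record only needs the configured id and image keys, and relative image paths are resolved against the JSONL's directory by load_local_file() above. The literal key names below assume the FileConstants defaults resolve to 'id' and 'image_path'; that is an assumption, not something stated in this diff, and the file path is hypothetical.

# Hypothetical outputs/edited_images.jsonl, one record per line:
#   {"id": "sample_001", "image_path": "images/sample_001.png"}
#   {"id": "sample_002", "image_path": "images/sample_002.png"}

# Hypothetical wiring through extra_params; the keys mirror ImageEditAdapter.__init__ above.
extra_params = {
    'local_file': 'outputs/edited_images.jsonl',
    'id_key': 'id',             # defaults to FileConstants.ID
    'image_key': 'image_path',  # defaults to FileConstants.IMAGE_PATH
}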
evalscope/api/benchmark/adapters/multi_choice_adapter.py
@@ -18,8 +18,11 @@ class MultiChoiceAdapter(DefaultDataAdapter):
      This adapter formats the input for multi-choice questions and handles few-shot examples.
      """

-     multiple_correct: bool = False
-     """Whether the benchmark allows multiple correct answers."""
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.multiple_correct: bool = False
+         """Whether the benchmark allows multiple correct answers."""

      def format_prompt_template(self, sample: Sample) -> str:
          """
evalscope/api/benchmark/adapters/ner_adapter.py
@@ -0,0 +1,212 @@
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.ner import (
+     DEFAULT_TAG_FIX_PATTERNS,
+     calculate_bio_metrics,
+     clean_prediction,
+     create_target_text,
+     extract_entities_from_text,
+     extract_spans_from_bio,
+     xml_to_bio_tags,
+ )
+ from .default_data_adapter import DefaultDataAdapter
+
+ logger = get_logger()
+
+
+ class NERAdapter(DefaultDataAdapter):
+     """
+     Base adapter class for Named Entity Recognition (NER) tasks.
+
+     This adapter handles converting between BIO tagging schemes and XML-style entity markup,
+     and provides evaluation metrics using seqeval.
+
+     Subclasses should define their entity types and register the benchmark.
+     """
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # Define mapping from BIO tags to user-friendly tag names
+         self.entity_type_map = {}
+         # Add descriptions for each entity type
+         self.entity_descriptions = {}
+
+         # These will be initialized in setup_entity_mappings
+         self.reverse_entity_map = {}
+         self.entity_list = []
+         self.entities_description = ''
+
+         # Define common error patterns to handle
+         self.tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+         check_import('seqeval', 'seqeval', raise_error=True, feature_name='NER metrics')
+         # Note: setup_entity_mappings() should be called by subclasses
+         # after they define their entity_type_map and entity_descriptions
+
+     def setup_entity_mappings(self):
+         """
+         Setup entity mappings and descriptions for prompt formatting.
+         This should be called after entity_type_map and entity_descriptions are defined.
+         """
+         # Reverse mapping for converting back from prediction to evaluation
+         self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+         # Create list of tags for prompt formatting
+         self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+         # Create description of entities for prompt
+         self.entities_description = ', '.join([
+             f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+         ])
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a record with tokens and NER tags into a Sample.
+         Creates both the raw text input and annotated text target.
+         """
+         tokens: List[str] = record['tokens']
+         ner_tags: List[str] = record['ner_tags']
+
+         # Create the input text by joining tokens
+         input_text = ' '.join(tokens)
+
+         # Process tokens and tags to create annotated target text
+         target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+         # Store tokens and tags in metadata for evaluation
+         metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+         return Sample(input=input_text, target=target_text, metadata=metadata)
+
+     def format_prompt_template(self, sample):
+         """
+         Format the prompt with entity types, available tags, and text to annotate.
+         """
+         return self.prompt_template.format(
+             entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+         )
+
+     def format_fewshot_template(self, fewshot, sample):
+         """
+         Format the few-shot prompt with all required parameters.
+         """
+         return self.few_shot_prompt_template.format(
+             fewshot=fewshot,
+             entities=self.entities_description,
+             entity_list=', '.join(self.entity_list),
+             text=sample.input
+         )
+
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         """
+         Format a sample as a few-shot example showing original and annotated text.
+         """
+         if not sample.metadata:
+             return ''
+
+         # Format few-shot examples to match the expected response format
+         return f'Input:\n{sample.input}\n\nOutput:\n{sample.target}'
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         """
+         Evaluate named entity recognition performance using seqeval.
+         """
+         from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         try:
+             # Get the original tokens and tags from the reference metadata
+             original_tokens = task_state.metadata['tokens']
+             original_tags = task_state.metadata['ner_tags']
+
+             if not original_tokens or len(original_tokens) == 0:
+                 if hasattr(reference, 'metadata') and reference.metadata:
+                     original_tokens = reference.metadata['tokens']
+                     original_tags = reference.metadata['ner_tags']
+
+             # Clean and normalize the prediction
+             cleaned_prediction = clean_prediction(filtered_prediction, self.tag_fix_patterns)
+
+             # Convert XML-style prediction back to BIO tags aligned with original tokens
+             pred_bio_tags = xml_to_bio_tags(cleaned_prediction, original_tokens, self.reverse_entity_map)
+
+             # Use seqeval to calculate metrics
+             # Note: seqeval expects lists of lists (one per sequence)
+             y_true = [original_tags]
+             y_pred = [pred_bio_tags]
+
+             precision = precision_score(y_true, y_pred)
+             recall = recall_score(y_true, y_pred)
+             f1 = f1_score(y_true, y_pred)
+             accuracy = accuracy_score(y_true, y_pred)
+
+             score.value = {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}
+
+             # Store tags for aggregation (proper micro-averaging in aggregate_scores)
+             # This way aggregate_scores can compute metrics across all samples at once,
+             # which gives you true micro-averaged scores rather than averaged macro scores.
+             score.metadata = {'y_true': original_tags, 'y_pred': pred_bio_tags}
+         except Exception as e:
+             logger.warning(f'Error evaluating NER prediction: {str(e)}')
+             score.value = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
+
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Aggregate metrics across all samples using seqeval.
+         """
+         from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+         # Collect all predictions and references
+         y_true_all = []
+         y_pred_all = []
+
+         for ss in sample_scores:
+             # Extract the BIO tags from metadata if available
+             # You may need to store these during match_score
+             if hasattr(ss.score, 'metadata') and 'y_true' in ss.score.metadata and 'y_pred' in ss.score.metadata:
+                 y_true_all.append(ss.score.metadata['y_true'])
+                 y_pred_all.append(ss.score.metadata['y_pred'])
+
+         if not y_true_all:
+             # Fallback: calculate averages from individual scores
+             num_samples = len(sample_scores)
+             avg_precision = sum(ss.score.value.get('precision', 0.0) for ss in sample_scores) / num_samples
+             avg_recall = sum(ss.score.value.get('recall', 0.0) for ss in sample_scores) / num_samples
+             avg_f1 = sum(ss.score.value.get('f1_score', 0.0) for ss in sample_scores) / num_samples
+             avg_accuracy = sum(ss.score.value.get('accuracy', 0.0) for ss in sample_scores) / num_samples
+         else:
+             # Use seqeval for micro-averaged metrics across all samples
+             avg_precision = precision_score(y_true_all, y_pred_all)
+             avg_recall = recall_score(y_true_all, y_pred_all)
+             avg_f1 = f1_score(y_true_all, y_pred_all)
+             avg_accuracy = accuracy_score(y_true_all, y_pred_all)
+
+         num_samples = len(sample_scores)
+
+         agg_scores = [
+             AggScore(
+                 metric_name='precision',
+                 score=avg_precision,
+                 num=num_samples,
+                 metadata={'type': 'seqeval-micro-average'}
+             ),
+             AggScore(
+                 metric_name='recall', score=avg_recall, num=num_samples, metadata={'type': 'seqeval-micro-average'}
+             ),
+             AggScore(metric_name='f1_score', score=avg_f1, num=num_samples, metadata={'type': 'seqeval-micro-average'}),
+             AggScore(
+                 metric_name='accuracy', score=avg_accuracy, num=num_samples, metadata={'type': 'seqeval-accuracy'}
+             )
+         ]
+
+         return agg_scores
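Per the note in NERAdapter.__init__ above, subclasses define their entity maps and then call setup_entity_mappings(). A hedged sketch follows; the adapter name, entity keys, and descriptions are illustrative and not taken from any benchmark shipped in this release:

from evalscope.api.benchmark import NERAdapter


class PersonOrgLocNERAdapter(NERAdapter):  # hypothetical subclass
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Map entity types (as used in the dataset's BIO tags) to the tag names
        # used in the XML-style markup; the key format here is an assumption.
        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
        # Short descriptions that feed entities_description in the prompt.
        self.entity_descriptions = {
            'PER': 'names of people',
            'ORG': 'companies and institutions',
            'LOC': 'countries, cities and other places',
        }
        # Build reverse_entity_map, entity_list and entities_description from the dicts above.
        self.setup_entity_mappings()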
evalscope/api/benchmark/adapters/text2image_adapter.py
@@ -8,7 +8,7 @@ from evalscope.api.messages.content import ContentImage
  from evalscope.api.metric import Score
  from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
  from evalscope.api.registry import get_metric
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, FileConstants
  from evalscope.utils import get_logger
  from evalscope.utils.function_utils import thread_safe
  from .default_data_adapter import DefaultDataAdapter
@@ -19,6 +19,11 @@ logger = get_logger()

  class Text2ImageAdapter(DefaultDataAdapter):
      """Text to Image Adapter for benchmarks."""

+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.add_aggregation_name = False  # Do not add aggregation name in the report by default
+
      def load_from_disk(self, **kwargs):
          return super().load_from_disk(use_local_loader=True)

@@ -27,11 +32,12 @@ class Text2ImageAdapter(DefaultDataAdapter):
          return Sample(
              input=[ChatMessageUser(content=record['prompt'])],
              metadata={
-                 'id': record['id'],
                  'prompt': record['prompt'],
                  'category': record.get('category', ''),
                  'tags': record.get('tags', []),
-                 'image_path': record.get('image_path', ''),  # Optional field for existing image path
+                 FileConstants.ID: record[FileConstants.ID],
+                 FileConstants.IMAGE_PATH: record.get(FileConstants.IMAGE_PATH,
+                                                      ''),  # Optional field for existing image path
              }
          )

@@ -83,7 +89,7 @@
              completed=True,
          )
      else:
-         image_id = f"{sample.metadata.get('id',sample.id)}_{sample.group_id}"
+         image_id = f'{sample.metadata.get(FileConstants.ID, sample.id)}_{sample.group_id}'
          output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
          if not os.path.exists(os.path.dirname(output_path)):
              os.makedirs(os.path.dirname(output_path))
@@ -96,7 +102,7 @@
          with open(output_path, 'wb') as f:
              f.write(base64.b64decode(image_base64))

-         sample.metadata['image_path'] = output_path
+         sample.metadata[FileConstants.IMAGE_PATH] = output_path
          return TaskState(
              model=model.name,
              sample=sample,
@@ -111,7 +117,7 @@
          self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
      ) -> Score:
          # Get prediction and prompt from task state
-         image_path = task_state.metadata.get('image_path', original_prediction)
+         image_path = task_state.metadata.get(FileConstants.IMAGE_PATH, original_prediction)
          prompt = task_state.input[0].content
          meta = task_state.metadata

@@ -149,7 +155,3 @@
                  score.metadata[metric_name] = f'error: {str(e)}'

          return score
-
-     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-         # Don't add aggregation name for needle haystack adapter
-         return super()._on_generate_report(scores, model_name, False)
evalscope/api/benchmark/adapters/vision_language_adapter.py
@@ -0,0 +1,8 @@
+ from .default_data_adapter import DefaultDataAdapter
+
+
+ class VisionLanguageAdapter(DefaultDataAdapter):
+     """Adapter for vision-language benchmarks. e.g., image captioning, visual question answering, etc."""
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)