evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,565 @@
1
+ # flake8: noqa
2
+ import math
3
+ import re
4
+ import sympy as sp
5
+ from latex2sympy2_extended import latex2sympy as parse_latex
6
+ from sympy import Eq, Pow, simplify, sympify
7
+
8
+ from evalscope.utils import get_logger
9
+
10
+ # from sympy.parsing.latex import parse_latex
11
+
12
+ logger = get_logger()
13
+
14
+
15
def get_single_answer_type_text(answer_type, is_chinese):
    """Return a human-readable description for one answer-type tag.

    Args:
        answer_type: tag such as 'Numerical' or 'Numerical-<suffix>'; only
            the part before the first '-' is considered.
        is_chinese: return the Chinese wording when True, English otherwise.

    Returns:
        The localized description for the first known tag contained in
        ``answer_type``.

    Raises:
        ValueError: if no known tag is found.
    """
    # Only the portion before the first '-' matters (e.g. 'Numerical-2').
    dash_pos = answer_type.find('-')
    if dash_pos != -1:
        answer_type = answer_type[:dash_pos]

    # (Chinese, English) descriptions, checked in this fixed order.
    descriptions = {
        'Numerical': ('数值', 'a numerical value'),
        'Expression': ('表达式', 'an expression'),
        'Equation': ('方程', 'an equation'),
        'Interval': ('区间', 'an interval'),
    }
    for tag, (zh_text, en_text) in descriptions.items():
        if tag in answer_type:
            return zh_text if is_chinese else en_text
    raise ValueError(f'Error parsing answer type {answer_type}!')
38
+
39
+
40
def get_answer_type_text(answer_type, is_chinese, multiple_answer):
    """Build the localized hint sentence describing the expected answer type(s).

    Args:
        answer_type: single type tag, or comma-separated tags for multi-part
            answers (e.g. 'Numerical' or 'Numerical,Expression').
        is_chinese: emit the Chinese wording when True.
        multiple_answer: whether the problem expects several answers.

    Returns:
        A sentence fragment to splice into the prompt, or '' for answer
        kinds that get no hint.
    """
    # Human-evaluated or tuple-typed answers receive no type hint.
    if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
        return ''

    if not multiple_answer:
        answer_text = get_single_answer_type_text(answer_type, is_chinese)
        if is_chinese:
            return f',答案类型为{answer_text}'
        return f'The answer of The problem should be {answer_text}. '

    # Multiple answers: resolve each tag (a lone tag applies to all of them).
    answer_texts = [get_single_answer_type_text(t, is_chinese) for t in answer_type.split(',')]

    if len(set(answer_texts)) == 1:
        # Every answer shares the same type.
        answer_text = answer_texts[0]
        if is_chinese:
            return f',题目有多个答案,答案类型均为{answer_text}'
        return f'The problem has multiple answers, each of them should be {answer_text}. '

    # Mixed answer types, listed in problem order.
    if is_chinese:
        answer_text = '、'.join(answer_texts)
        return f',题目有多个答案,答案类型分别为{answer_text}'
    answer_text = ', '.join(answer_texts)
    return f'The problem has multiple answers, with the answers in order being {answer_text}. '
76
+
77
+
78
class OlympiadBenchPrompter:
    """Builds the evaluation prompt for an OlympiadBench problem.

    Produces a Chinese or English instruction block depending on the
    problem's language, covering theorem-proving problems and open-ended
    problems with \\boxed{} answer formatting.
    """

    def __init__(self):
        pass

    def make_prompt(
        self,
        problem,
        language,
        subject,
        question_type,
        answer_type,
        is_multiple_answer,
        unit,
    ):
        """Generate the full prompt for one problem.

        Args:
            problem: the problem statement text.
            language: 'Chinese' selects the Chinese prompt; anything else English.
            subject: 'Math' selects math wording; anything else physics.
            question_type: 'Theorem proof' selects the proof instructions.
            answer_type: answer-type tag(s), forwarded to get_answer_type_text.
            is_multiple_answer: whether several answers are expected.
            unit: truthy when the answer carries a unit (kept out of \\boxed{}).

        Returns:
            The assembled prompt string, ending with a step-by-step
            reasoning instruction.
        """
        self.is_chinese = language == 'Chinese'
        self.is_math = subject == 'Math'
        self.is_theorem_proving = question_type == 'Theorem proof'
        """Generate prompt based on question properties."""
        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                # Chinese theorem-proving instructions.
                prompt = (f'以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,'
                          f'运用逻辑推理及常用定理证明题目中的命题。证明过程中使用的变量和公式请使用LaTeX格式表示。')
            else:
                answer_type_text = get_answer_type_text(
                    answer_type,
                    is_chinese=True,
                    multiple_answer=is_multiple_answer,
                )
                if is_multiple_answer:
                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
                else:
                    multiple_answer_text = '\\boxed{答案}'
                unit_text = ''
                if unit:
                    # Units must be written outside the \boxed{} answer.
                    multiple_answer_text += '(单位)'
                    unit_text = ',注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。'
                    f'请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的'
                    f'变量和公式请使用LaTeX格式表示。请在最后以"所以最终答案是'
                    f'{multiple_answer_text}。"显式给出结果{unit_text}。'
                )
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                # English theorem-proving instructions.
                prompt = (
                    f'The following is a theorem proving problem from an '
                    f'International {subject_content} competition. Please use '
                    f'logical reasoning and common theorems to prove the '
                    f'proposition in the problem according to the given '
                    f'requirements. Please use LaTeX format to represent the '
                    f'variables and formulas used in the proof.'
                )
            else:
                if is_multiple_answer:
                    multiple_answer_text = ('\\boxed{multiple answers connected with commas}')
                else:
                    multiple_answer_text = '\\boxed{answer}'
                unit_text = ''
                if unit:
                    # Units must be written outside the \boxed{} answer.
                    multiple_answer_text += '(unit)'
                    unit_text = (', note that the unit of the answer should '
                                 'not be included in \\boxed{}')
                answer_type_text = get_answer_type_text(
                    answer_type,
                    is_chinese=False,
                    multiple_answer=is_multiple_answer,
                )
                prompt = (
                    f'The following is an open-ended problem from an '
                    f'International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according '
                    f'to the given requirements and the information provided. '
                    f'Please use LaTeX format to represent the variables and '
                    f'formulas used in the solution process and results. '
                    f'Please end your solution with "So the final answer is '
                    f'{multiple_answer_text}." and give the result explicitly'
                    f'{unit_text}.'
                )
        # Add problem statement to the prompt
        prompt = prompt + '\n' + problem + '\n'
        # Add step-by-step reasoning instruction
        if self.is_chinese:
            prompt += '\n请通过逐步推理来解答问题,并把最终答案放置于\\boxed{}中。'
        else:
            prompt += ('\nPlease reason step by step, and put your final '
                       'answer within \\boxed{}.')
        return prompt
168
+
169
+
170
+ # Evaluate
171
+
172
+
173
+ class MathJudger:
174
+
175
+ def __init__(self):
176
+ self.special_signal_map = {
177
+ '\\left': '',
178
+ '\\right': '',
179
+ '∶': ':',
180
+ ',': ',',
181
+ '$': '',
182
+ '\\approx': '=',
183
+ '\\simeq': '=',
184
+ '\\sim': '=',
185
+ '^\\prime': "'",
186
+ '^{\\prime}': "'",
187
+ '^\\circ': '',
188
+ '%': '',
189
+ }
190
+ self.pi = parse_latex('\\pi')
191
+ self.precision = 1e-8
192
+
193
+ def split_by_comma(self, expr: str):
194
+ in_bracket_num = 0
195
+ splitted_expr = []
196
+ start_idx = 0
197
+ for i, char in enumerate(expr):
198
+ if char == '(' or char == '[':
199
+ in_bracket_num += 1
200
+ elif char == ')' or char == ']':
201
+ in_bracket_num -= 1
202
+ elif char == ',' and in_bracket_num == 0:
203
+ splitted_expr.append(expr[start_idx:i].strip())
204
+ start_idx = i + 1
205
+
206
+ if start_idx < len(expr):
207
+ splitted_expr.append(expr[start_idx:].strip())
208
+
209
+ return splitted_expr
210
+
211
+ def trans_plus_minus_sign(self, expr_list: list):
212
+ new_expr_list = []
213
+ for expr in expr_list:
214
+ if '\\pm' in expr:
215
+ new_expr_list.append(expr.replace('\\pm', '+'))
216
+ new_expr_list.append(expr.replace('\\pm', '-'))
217
+ else:
218
+ new_expr_list.append(expr)
219
+
220
+ return new_expr_list
221
+
222
    def judge(self, expression1, expression2, precision=1e-8):
        """Top-level equivalence check between ground truth and prediction.

        Args:
            expression1: ground-truth answer string (may contain several
                comma-separated sub-answers).
            expression2: model prediction string.
            precision: absolute tolerance — a single float applied to every
                sub-answer, or a list with one tolerance per sub-answer.

        Returns:
            True if every ground-truth sub-answer can be matched one-to-one
            with an equivalent sub-answer in the prediction.
        """
        # Normalize precision into a list (one entry per sub-answer).
        precision = precision if type(precision) == list else [precision]

        try:
            # NOTE(review): self.preprocess is defined elsewhere in this file
            # (outside this excerpt); any failure during normalization is
            # treated as a mismatch.
            expression1, expression2 = self.preprocess(expression1, expression2)
        except Exception:
            return False
        if expression1 == expression2:
            return True

        # Strip CJK characters (e.g. unit words) so they do not break parsing.
        expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
        expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)

        # Split both sides into sub-answers on top-level commas.
        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        # Expand '\pm' into explicit '+' and '-' variants on both sides.
        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Replicate a single tolerance for every sub-answer.
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Greedy matching: pair each remaining ground-truth item with the
        # first equivalent prediction item, removing matched pairs (and the
        # used tolerance) until both lists empty out or some item cannot be
        # matched. self.precision carries the per-item tolerance into
        # is_equal() and its helpers.
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # No prediction item matched this ground-truth item.
                return False

        # Every element was matched and removed: the lists pair up completely.
        return True
269
+
270
+ def is_interval(self, epr):
271
+ return epr.startswith(('(', '[')) and epr.endswith((')', ']'))
272
+
273
+ def sympy_sub_pi(self, expression_sympy):
274
+ return expression_sympy.subs(self.pi, math.pi)
275
+
276
    def is_equal(self, expression1, expression2):
        """Check one ground-truth/prediction pair for equivalence.

        Tries progressively weaker notions of equality in a fixed order:
        literal string match, interval equality, numeric equality, symbolic
        expression equality, and finally equation equality.
        """
        # Exact (non-empty) string match short-circuits everything else.
        if (expression1 == expression2 and expression1 != '' and expression2 != ''):
            return True

        # If both sides look like intervals, compare them as intervals; a
        # failure inside the interval comparison is a definitive mismatch.
        if self.is_interval(expression1) and self.is_interval(expression2):
            try:
                if self.interval_equal(expression1, expression2):
                    return True
            except Exception:
                return False

        # Numeric comparison within the current tolerance.
        try:
            if self.numerical_equal(expression1, expression2):
                return True
        except Exception:
            pass

        # Symbolic expression comparison; not accepted as a verdict when
        # both sides are equations (those are handled by the next step).
        try:
            if self.expression_equal(expression1, expression2) and not ('=' in expression1 and '=' in expression2):
                return True
        except Exception:
            pass

        # Equation comparison (sides proportional by a non-zero integer).
        try:
            if self.equation_equal(expression1, expression2):
                return True
        except Exception:
            pass

        return False
310
+
311
+ def numerical_equal(
312
+ self,
313
+ expression1: str,
314
+ expression2: str,
315
+ include_percentage: bool = True,
316
+ ):
317
+ """(默认 expression1 为 Ground_Truth) 函数: 判读两个数值是否在误差允许范围内相等 步骤1:
318
+
319
+ 将可能出现的百分号的情况包含进来 步骤2: 使用 math.isclose 函数判断是否相等.
320
+ """
321
+ reference = float(expression1)
322
+ prediction = float(expression2)
323
+
324
+ if include_percentage:
325
+ gt_result = [reference / 100, reference, reference * 100]
326
+ else:
327
+ gt_result = [reference]
328
+
329
+ for item in gt_result:
330
+ if abs(item - prediction) <= self.precision * 1.01:
331
+ return True
332
+ return False
333
+
334
    def expression_equal(self, exp1, exp2):
        """Check whether two LaTeX expressions are mathematically equivalent.

        *exp1* is treated as ground truth. If a side is written as 'x = 1',
        only the text after the '=' is compared, so models answering
        'x = 1' instead of '1' are still accepted. Comparison is done with
        sympy: exact symbolic match first, then a numeric comparison for
        constant expressions, then simplification of the difference.
        """

        # Keep only the right-hand side of an '=' (takes split('=')[1], so
        # for 'a=b=c' only the middle segment 'b' is used).
        def extract_expression(expression):
            if '=' in expression:
                expression = expression.split('=')[1]
            return expression.strip()

        exp1 = extract_expression(exp1)
        exp2 = extract_expression(exp2)

        # Parse LaTeX into sympy objects.
        expr1_sym = sympify(parse_latex(exp1))
        expr2_sym = sympify(parse_latex(exp2))

        if expr1_sym == expr2_sym:
            return True
        else:
            # Replace symbolic π with its float value before numeric checks.
            expr1_sym = self.sympy_sub_pi(expr1_sym)
            expr2_sym = self.sympy_sub_pi(expr2_sym)

            # One side symbolic, the other constant: cannot be equal.
            if (expr1_sym.has(sp.Symbol)
                    and not expr2_sym.has(sp.Symbol)) or (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
                return False
            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
                # Both sides are constants: compare numerically within
                # tolerance. NOTE(review): self.can_compute_power is defined
                # elsewhere in this file (outside this excerpt); it appears to
                # guard against evaluating astronomically large powers.
                try:
                    if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
                        logger.warning(
                            f'These two number can not be calculated by '
                            f'current computer for: '
                            f'"{str(expr1_sym)}" and "{str(expr2_sym)}"'
                        )
                        return False

                    if (abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01):
                        return True
                    else:
                        return False
                except Exception:
                    return False
            else:
                # Both sides contain symbols: equal iff their difference
                # simplifies to (numerically) zero. Uses a fixed 1e-3 bound,
                # not self.precision.
                try:
                    simplified_expr = simplify(expr1_sym - expr2_sym)

                    num_value = simplified_expr.evalf()

                    return abs(num_value) < 1e-3
                except Exception:
                    return False
385
+
386
+ def equation_equal(self, expression1, expression2):
387
+ """
388
+ (expression1 is assumed to be Ground_Truth)
389
+ Function: Check if two equations are mathematically equivalent
390
+ Step 1: Simplify equations to standard form with right side equal to 0
391
+ Step 2: Use sympy library to calculate quotient of left sides,
392
+ if quotient or its reciprocal is integer, equations are equivalent
393
+ """
394
+
395
+ # Convert equations to sympy format with right side moved to left side
396
+ def simplify_equation(latex_eq):
397
+ # Split left and right sides of equation
398
+ lhs, rhs = latex_eq.split('=')
399
+
400
+ # Parse LaTeX expressions using parse_latex
401
+ lhs_expr = parse_latex(lhs)
402
+ rhs_expr = parse_latex(rhs)
403
+
404
+ # Create equation object
405
+ equation = Eq(lhs_expr, rhs_expr)
406
+
407
+ # Simplify equation by moving right side to left
408
+ simplified_eq = simplify(equation.lhs - equation.rhs)
409
+
410
+ return simplified_eq
411
+
412
+ expr1_sym = simplify_equation(expression1)
413
+ expr2_sym = simplify_equation(expression2)
414
+
415
+ division_result_1 = simplify(expr1_sym / expr2_sym)
416
+ division_result_2 = simplify(expr2_sym / expr1_sym)
417
+
418
+ # If division result or its reciprocal is
419
+ # non-zero integer, equations are equivalent
420
+ if (division_result_1.is_Integer
421
+ and division_result_1 != 0) or (division_result_2.is_Integer and division_result_2 != 0):
422
+ return True
423
+ else:
424
+ return False
425
+
426
+ def interval_equal(self, expression1, expression2):
427
+ """
428
+ Function: Check if two intervals are mathematically equivalent
429
+ Step 1: Simplify interval expressions,
430
+ remove irrelevant symbols
431
+ like "\\left", "\\right", and "x \\in"
432
+ Step 2: Compare brackets and mathematical expressions in between
433
+ """
434
+
435
+ def compare_two_interval(inter1, inter2):
436
+ # First compare brackets on both sides
437
+ if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
438
+ return False
439
+
440
+ inter1 = inter1.strip('[]()')
441
+ inter2 = inter2.strip('[]()')
442
+
443
+ # Split interval into left and right parts
444
+ items_1 = inter1.split(',')
445
+ items_2 = inter2.split(',')
446
+
447
+ for item_1, item_2 in zip(items_1, items_2):
448
+ if not self.expression_equal(item_1, item_2):
449
+ return False
450
+ return True
451
+
452
+ interval1 = expression1
453
+ interval2 = expression2
454
+
455
+ if interval1 == interval2:
456
+ return True
457
+ else:
458
+ inter_list1 = interval1.split('\\cup')
459
+ inter_list2 = interval2.split('\\cup')
460
+
461
+ if len(inter_list1) != len(inter_list2):
462
+ return False
463
+ else:
464
+ for inter1, inter2 in zip(inter_list1, inter_list2):
465
+ if not compare_two_interval(inter1, inter2):
466
+ return False
467
+ return True
468
+
469
+ def preprocess(self, expression1, expression2):
470
+ """Extract and preprocess expressions from model output."""
471
+
472
+ def extract_boxed_content(latex_str):
473
+ # Find all \boxed{...} structures
474
+ boxed_matches = re.finditer(r'\\boxed{', latex_str)
475
+ results = ''
476
+
477
+ for match in boxed_matches:
478
+ start_index = match.end()
479
+ end_index = start_index
480
+ stack = 1
481
+
482
+ # Search from after \boxed{ until
483
+ # finding matching closing brace
484
+ while stack > 0 and end_index < len(latex_str):
485
+ if latex_str[end_index] == '{':
486
+ stack += 1
487
+ elif latex_str[end_index] == '}':
488
+ stack -= 1
489
+ end_index += 1
490
+
491
+ if stack == 0:
492
+ # Extract content inside \boxed{}
493
+ content = latex_str[start_index:end_index - 1]
494
+ results += content + ','
495
+ else:
496
+ raise ValueError('Mismatched braces in LaTeX string.')
497
+
498
+ # If no \boxed{} found, extract formulas from last line
499
+ if results == '':
500
+ last_line_ans = latex_str.strip().split('\n')[-1]
501
+ dollar_pattern = r'\$(.*?)\$'
502
+ answers = re.findall(dollar_pattern, last_line_ans)
503
+
504
+ if answers:
505
+ for ans in answers:
506
+ results += ans + ','
507
+ else:
508
+ results = latex_str
509
+
510
+ return results
511
+
512
+ def special_symbol_replace(expression):
513
+ if '\\in ' in expression:
514
+ expression = expression.split('\\in ')[1]
515
+
516
+ # Replace special characters that
517
+ # don't affect LaTeX parsing (decorative)
518
+ for signal in self.special_signal_map:
519
+ expression = expression.replace(signal, self.special_signal_map[signal])
520
+
521
+ expression = expression.strip('\n$,.:;^_=+`!@#$%^&*~,。')
522
+
523
+ pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
524
+ expression = re.sub(pattern, r'\1', expression)
525
+
526
+ return expression
527
+
528
+ exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
529
+ exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
530
+
531
+ return exp1, exp2
532
+
533
+ def can_compute_power(self, expr):
534
+ """Check if the power expression can be computed.
535
+
536
+ Parameters:
537
+ expr (sympy expression): The expression to check.
538
+
539
+ Returns:
540
+ bool: True if the expression can be computed, False otherwise.
541
+ """
542
+ # Check if the expression is a power expression
543
+ if isinstance(expr, Pow):
544
+ # Extract the base and the exponent
545
+ base, exp = expr.as_base_exp()
546
+
547
+ # Check if the base and the exponent are numbers
548
+ if base.is_number and exp.is_number:
549
+ # Set a threshold for the maximum size of the exponent
550
+ # can be adjusted based on the computing environment
551
+ MAX_EXP = 1000
552
+
553
+ # Check if the exponent is greater than the threshold
554
+ if abs(exp.evalf()) > MAX_EXP:
555
+ return False
556
+ else:
557
+ return True
558
+ else:
559
+ # If the base or the exponent is not a number,
560
+ # we cannot compute the power
561
+ return False
562
+ else:
563
+ # If the expression is not a power expression,
564
+ # return True as it is not the case we are checking for
565
+ return True
File without changes
@@ -0,0 +1,86 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
4
+ from evalscope.api.dataset import Sample
5
+ from evalscope.api.messages import ChatMessageUser, Content, ContentAudio, ContentImage, ContentText
6
+ from evalscope.api.registry import register_benchmark
7
+ from evalscope.constants import Tags
8
+ from evalscope.utils.import_utils import check_import
9
+ from evalscope.utils.io_utils import bytes_to_base64
10
+ from evalscope.utils.logger import get_logger
11
+ from evalscope.utils.multi_choices import prompt
12
+
13
# Module-level logger for this benchmark adapter.
logger = get_logger()

# Prompt template for multiple-choice questions with image and audio context.
# `{letters}`, `{question}` and `{choices}` are filled in per sample; the
# 'ANSWER: $LETTER' last-line format is what the answer extractor parses.
MULT_CHOICE_PROMPT = r"""
Answer the following multiple choice question based on the image and audio content. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.

{question}

{choices}
""".strip()  # noqa: E501
22
+
23
+
24
@register_benchmark(
    BenchmarkMeta(
        name='omni_bench',
        pretty_name='OmniBench',
        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
        description=
        'OmniBench, a pioneering universal multimodal benchmark designed to rigorously evaluate MLLMs\' capability to recognize, interpret, and reason across visual, acoustic, and textual inputs simultaneously.',  # noqa: E501
        dataset_id='m-a-p/OmniBench',
        metric_list=['acc'],
        eval_split='train',
        prompt_template=MULT_CHOICE_PROMPT,
        extra_params={
            'use_image': True,  # Whether to use image input, if False, use text alternative image content.
            'use_audio': True,  # Whether to use audio input, if False, use text alternative audio content.
        }
    )
)
class OmniBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
    """Adapter for OmniBench: multiple-choice questions over image + audio + text."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Either modality can be replaced by its textual description via extra_params.
        self.use_image = self.extra_params.get('use_image', True)
        self.use_audio = self.extra_params.get('use_audio', True)

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one raw OmniBench record into an evaluation Sample."""
        options = record['options']
        answer = record['answer']
        # Map the answer text back to its choice letter (A, B, C, ...).
        target_letter = chr(ord('A') + options.index(answer))

        question_text = prompt(question=record['question'], choices=options, template=MULT_CHOICE_PROMPT)
        contents: List[Content] = [ContentText(text=question_text)]

        if self.use_image:
            encoded_image = bytes_to_base64(record['image']['bytes'], format='png', add_header=True)
            contents.append(ContentImage(image=encoded_image))
        else:
            contents.append(ContentText(text=f"[Image Alternative Text]: {record['image content']}"))

        if self.use_audio:
            encoded_audio = bytes_to_base64(
                record['audio']['bytes'], format='mp3', add_header=True, content_type='audio'
            )
            contents.append(ContentAudio(audio=encoded_audio, format='mp3'))
        else:
            contents.append(ContentText(text=f"[Audio Alternative Text]: {record['audio content']}"))

        return Sample(
            input=[ChatMessageUser(content=contents)],
            choices=options,
            target=target_letter,
            metadata={
                'index': record['index'],
                'task_type': record['task type'],
                'audio_type': record['audio type'],
                'answer': answer,
                'image_content': record['image content'],
                'audio_content': record['audio content'],
            }
        )
File without changes