evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/coin_flip/coin_flip_adapter.py
@@ -0,0 +1,128 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ DESCRIPTION = (
+     "CoinFlip is a symbolic reasoning dataset that tests an LLM's ability "
+     'to track binary state changes through a sequence of actions. '
+     'Each example describes whether a coin is flipped or not by different person, '
+     'requiring logical inference to determine the final state (heads or tails).'
+ )  # noqa: E501
+
+ PROMPT_TEMPLATE = """
+ Solve the following coin flip problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+
+ {question}
+
+ Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer YES or NO to the problem.
+
+ Reasoning:
+ """  # noqa: E501
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of how to solve similar problems:
+
+ {fewshot}
+
+ """.lstrip() + PROMPT_TEMPLATE  # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='coin_flip',
+         pretty_name='CoinFlip',
+         tags=[Tags.REASONING, Tags.YES_NO],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/coin-flip',
+         metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+         aggregation='f1',
+         few_shot_num=0,
+         train_split='validation',
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CoinFlipAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question']
+         answer = record['answer']
+         input_text = self.prompt_template.format(question=question)
+         content_list: List[Content] = [ContentText(text=input_text)]
+         answer = str(answer).upper()  # 'YES' or 'NO'
+         return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+             'answer': answer,
+         })
+
+     def extract_answer(self, prediction, task_state):
+         import re
+
+         match = re.search(r'ANSWER:\s*(.*)', prediction)
+         return match.group(1) if match else prediction
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         # Check for an exact match against the extracted answer.
+         result = 1 if reference in filtered_prediction else 0
+         score.value = {'acc': result}
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+         """
+
+         tp = fp = tn = fn = 0
+         yes_count = 0
+         total_count = len(sample_scores)
+
+         for ss in sample_scores:
+             gt = ss.sample_metadata['answer'].strip().upper()
+             pred = ss.score.extracted_prediction.strip().upper()
+
+             if pred == 'YES':
+                 yes_count += 1
+             if pred == 'YES' and gt == 'YES':
+                 tp += 1
+             elif pred == 'YES' and gt == 'NO':
+                 fp += 1
+             elif pred == 'NO' and gt == 'NO':
+                 tn += 1
+             elif pred == 'NO' and gt == 'YES':
+                 fn += 1
+
+         accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+         f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+         yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+         overall_metrics = {
+             'accuracy': accuracy,
+             'precision': precision,
+             'recall': recall,
+             'f1_score': f1_score,
+             'yes_ratio': yes_ratio
+         }
+
+         agg_scores = []
+         for metric_name, value in overall_metrics.items():
+             agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+         return agg_scores
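The adapter above registers under the benchmark name 'coin_flip', so it becomes selectable like any other evalscope dataset. Below is a minimal smoke-test sketch, assuming the standard TaskConfig / run_task entry points exposed by the package and a placeholder model id:

from evalscope import TaskConfig, run_task

# Placeholder model id; any supported checkpoint or API-served model works here.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['coin_flip'],  # the name registered by @register_benchmark above
    limit=20,  # small smoke test; drop to evaluate the full test split
)
run_task(task_cfg=task_cfg)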
evalscope/benchmarks/commonsense_qa/__init__.py (file without changes)
evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py
@@ -0,0 +1,32 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+ DESCRIPTION = 'CommonsenseQA requires different types of commonsense knowledge to predict the correct answers.'
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='commonsense_qa',
+         pretty_name='CommonsenseQA',
+         tags=[Tags.REASONING, Tags.COMMONSENSE, Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/commonsense-qa',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='validation',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
+ )
+ class CommonsenseQAAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -71,3 +71,8 @@ class CompetitionMathAdapter(DefaultDataAdapter):
  
      def sample_to_fewshot(self, sample: Sample) -> str:
          return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
+
+     def extract_answer(self, prediction: str, task_state):
+         from evalscope.metrics.math_parser import extract_answer
+
+         return extract_answer(prediction)
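The new extract_answer hook delegates to evalscope.metrics.math_parser.extract_answer, so grading compares only the final answer pulled out of a free-form solution. The sketch below is merely an illustration of what such a parser typically does (take the payload of the last \boxed{...}, fall back to an "answer is ..." clause); it is not the library's implementation.

import re

def extract_boxed_answer(solution: str) -> str:
    """Illustrative only: return the last \\boxed{...} payload (handling nested braces),
    else the last 'answer is ...' clause, else the stripped text."""
    start = solution.rfind(r'\boxed{')
    if start != -1:
        i = start + len(r'\boxed{')
        depth = 1
        for j in range(i, len(solution)):
            if solution[j] == '{':
                depth += 1
            elif solution[j] == '}':
                depth -= 1
                if depth == 0:
                    return solution[i:j]
    m = re.search(r'answer is[:\s]*([^\n.]+)', solution, flags=re.IGNORECASE)
    return m.group(1).strip() if m else solution.strip()

print(extract_boxed_answer(r'... so the total is \boxed{\frac{7}{2}}.'))  # -> \frac{7}{2}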
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -6,9 +6,7 @@ from typing import Any, Dict, List
  from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
  from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
  from evalscope.api.evaluator import TaskState
- from evalscope.api.metric import Score
  from evalscope.api.metric.scorer import AggScore, SampleScore
- from evalscope.api.model.model import Model
  from evalscope.api.registry import get_benchmark, register_benchmark
  from evalscope.config import TaskConfig
  from evalscope.constants import DataCollection, Tags
@@ -22,8 +20,13 @@ logger = get_logger()
  @register_benchmark(
      BenchmarkMeta(
          name=DataCollection.NAME,
+         pretty_name='Data-Collection',
          dataset_id='',  # dataset_id need to be set
-         description='Data collection',
+         description='Custom Data collection, mixing multiple evaluation datasets for '
+         'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+         'assessment of the model\'s capabilities. '
+         '[Usage Reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html)',
+         tags=[Tags.CUSTOM],
          metric_list=['acc'],
          eval_split='test',
          prompt_template='',
@@ -55,9 +58,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
              data_id_or_path=dataset_path,
              split=self.eval_split,
              sample_fields=self.record_to_sample,
-             subset=self.default_subset,
+             subset='test',  # NOTE: using hardcoded test subset
              limit=self.limit,
-             repeats=self.repeats
+             repeats=self.repeats,
+             shuffle=self.shuffle,
          ).load()
  
          test_dataset = DatasetDict({self.default_subset: dataset})
@@ -95,7 +99,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
  
          # load dataset args
          dataset_args = copy.deepcopy(self._task_config.dataset_args)
-         common_args = dataset_args.get(DataCollection.NAME, {})
  
          # Iterate through each sample in the dataset
          dataset = self.test_dataset[self.default_subset]
@@ -108,7 +111,6 @@ class DataCollectionAdapter(DefaultDataAdapter):
  
              # update dataset args
              cur_dataset_args = dataset_args.get(dataset_name, {})
-             cur_dataset_args.update(common_args)
  
              # Initialize dataset adapter
              if dataset_name not in self.dataset_adapters:
@@ -141,19 +143,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
          data = []
          for sample_score in sample_scores:
              collection_info = sample_score.sample_metadata[DataCollection.INFO]
-             for metric_name, value in sample_score.score.value.items():
-                 data.append(
-                     dict(
-                         task_type=collection_info['task_type'],
-                         categories=tuple(collection_info['categories']),
-                         dataset_name=collection_info['dataset_name'],
-                         subset_name=collection_info['subset_name'],
-                         tags=collection_info['tags'],
-                         sample_id=sample_score.sample_id,
-                         metric=metric_name,
-                         score=value
-                     )
+             main_score = sample_score.score.main_value
+             main_metric = sample_score.score.main_score_name
+
+             # use main score
+             data.append(
+                 dict(
+                     task_type=collection_info['task_type'],
+                     categories=tuple(collection_info['categories']),
+                     dataset_name=collection_info['dataset_name'],
+                     subset_name=collection_info['subset_name'],
+                     tags=collection_info['tags'],
+                     sample_id=sample_score.sample_id,
+                     metric=main_metric,
+                     score=main_score
                  )
+             )
  
          df = pd.DataFrame(data)
  
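The last hunk above also changes how collection results are flattened for reporting: each sample now contributes a single row carrying only its main metric (main_score_name / main_value) rather than one row per metric. A toy illustration with made-up rows shaped like the dict built in that hunk, plus a per-dataset rollup:

import pandas as pd

# Made-up rows shaped like the dict(...) built in the hunk above.
rows = [
    dict(task_type='math', categories=('reasoning', ), dataset_name='gsm8k',
         subset_name='main', tags=['math'], sample_id=1, metric='acc', score=1.0),
    dict(task_type='qa', categories=('knowledge', ), dataset_name='arc',
         subset_name='easy', tags=['qa'], sample_id=2, metric='acc', score=0.0),
]
df = pd.DataFrame(rows)
# One row per sample; roll up the single main metric per dataset/subset.
print(df.groupby(['dataset_name', 'subset_name', 'metric'])['score'].mean())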
evalscope/benchmarks/docvqa/__init__.py (file without changes)
evalscope/benchmarks/docvqa/docvqa_adapter.py
@@ -0,0 +1,67 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT = """Answer the question according to the image using a single word or phrase.
+ {question}
+ The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='docvqa',
+         pretty_name='DocVQA',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+         dataset_id='lmms-lab/DocVQA',
+         subset_list=['DocVQA'],
+         metric_list=['anls'],
+         eval_split='validation',
+         prompt_template=PROMPT,
+     )
+ )
+ class DocVQAAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_aggregation_name = False
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         input_text = PROMPT.format(question=record['question'])
+         content_list: List[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=json.dumps(record.get('answers')),  # answers is a list
+             metadata={
+                 'questionId': record.get('questionId'),
+                 'question_types': record.get('question_types'),
+                 'docId': record.get('docId'),
+                 'ucsf_document_id': record.get('ucsf_document_id'),
+                 'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+             }
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         import re
+
+         pattern = r'ANSWER:\s*(.*)'
+         match = re.search(pattern, prediction)
+         if match:
+             return match.group(1).strip()
+         return prediction.strip()
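DocVQA is scored with ANLS (Average Normalized Levenshtein Similarity): each prediction is credited with its best normalized string similarity against any reference answer, similarities below a threshold (conventionally 0.5) count as zero, and the dataset score is the mean over questions. A self-contained sketch of that standard formulation follows; it is not necessarily evalscope's exact implementation.

def levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def anls(prediction: str, references: list, threshold: float = 0.5) -> float:
    # Best normalized similarity against any reference; below the threshold counts as 0.
    best = 0.0
    for ref in references:
        pred, ref = prediction.strip().lower(), ref.strip().lower()
        dist = levenshtein(pred, ref)
        best = max(best, 1.0 - dist / max(len(pred), len(ref), 1))
    return best if best >= threshold else 0.0

print(anls('Dr. Smith', ['dr smith', 'doctor smith']))  # ~0.89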
evalscope/benchmarks/drivelology/__init__.py (file without changes)
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py
@@ -0,0 +1,170 @@
+ # flake8: noqa: E501
+
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip()  # noqa: E501
+
+ FEWSHOT_PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ Here are some examples of how to solve similar problems:
+
+ #Input Text#: Saw a book called "how to solve 50 percent of your problems" so I bought 2 books.
+ #Your Answer#: Yes
+
+ #Input Text#: Colourless green ideas sleep furiously.
+ #Your Answer#: No
+
+ #Input Text#: I went to a restaurant, and saw this guy was choking. I gotta save him. And then I realized he was just speaking French.
+ #Your Answer#: Yes
+
+ #Input Text#: Either it is or it isn't.
+ #Your Answer#: No
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip()  # noqa: E501
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_binary',
+         pretty_name='DrivelologyBinaryClassification',
+         tags=[Tags.YES_NO],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['binary-classification'],
+         metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+         aggregation='f1',
+         few_shot_num=0,
+         eval_split='test',
+         prompt_template='{question}',
+         few_shot_prompt_template='{question}'
+     )
+ )
+ class DrivelologyBinaryClassificationAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+         if self.few_shot_num not in [0, 4]:
+             logger.warning(f'For DrivelologyBinaryClassification, use 4-shot by default.')
+             self.few_shot_num = 4
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         if self.few_shot_num > 0:
+             prompt = FEWSHOT_PROMPT_TEMPLATE.format(text=record['text'])
+         else:
+             prompt = PROMPT_TEMPLATE.format(text=record['text'])
+         content_list: List[Content] = [ContentText(text=prompt)]
+         answer = 'YES' if str(record['label']) == 'drivelology' else 'NO'  # 'YES' or 'NO'
+         return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+             'answer': answer,
+         })
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         # Check if the reference answer is in the filtered prediction
+         result = 1 if reference in filtered_prediction.strip().upper() else 0
+         score.value = {'acc': result}
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+         """
+
+         def compute_metrics(scores: List[SampleScore]):
+             tp = fp = tn = fn = 0
+             yes_count = 0
+             total_count = len(scores)
+
+             for ss in scores:
+                 gt = ss.sample_metadata['answer'].strip().upper()
+                 # Get prediction based on score
+                 pred = gt if ss.score.main_value == 1 else ('NO' if gt == 'YES' else 'YES')
+                 if pred == 'YES':
+                     yes_count += 1
+                 if pred == 'YES' and gt == 'YES':
+                     tp += 1
+                 elif pred == 'YES' and gt == 'NO':
+                     fp += 1
+                 elif pred == 'NO' and gt == 'NO':
+                     tn += 1
+                 elif pred == 'NO' and gt == 'YES':
+                     fn += 1
+
+             accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+             precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+             recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+             f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+             yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+             return {
+                 'accuracy': accuracy,
+                 'precision': precision,
+                 'recall': recall,
+                 'f1_score': f1_score,
+                 'yes_ratio': yes_ratio
+             }
+
+         overall_metrics = compute_metrics(sample_scores)
+         agg_scores = []
+         for metric_name, value in overall_metrics.items():
+             agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+         return agg_scores
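Both YES/NO adapters in this diff aggregate per-sample correctness into corpus-level accuracy, precision, recall, F1, and a yes_ratio. A small standalone check of that arithmetic on toy labels, independent of evalscope's classes:

# Toy ground truths and predictions for a YES/NO task.
gts   = ['YES', 'YES', 'NO', 'NO', 'YES']
preds = ['YES', 'NO',  'NO', 'YES', 'YES']

tp = sum(p == 'YES' and g == 'YES' for p, g in zip(preds, gts))
fp = sum(p == 'YES' and g == 'NO' for p, g in zip(preds, gts))
tn = sum(p == 'NO' and g == 'NO' for p, g in zip(preds, gts))
fn = sum(p == 'NO' and g == 'YES' for p, g in zip(preds, gts))

accuracy = (tp + tn) / len(gts)                       # (2 + 1) / 5 = 0.6
precision = tp / (tp + fp) if tp + fp else 0.0        # 2 / 3
recall = tp / (tp + fn) if tp + fn else 0.0           # 2 / 3
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
yes_ratio = preds.count('YES') / len(preds)           # 3 / 5

print(accuracy, round(precision, 3), round(recall, 3), round(f1, 3), yes_ratio)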