evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py
@@ -0,0 +1,161 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.report.report import Report, Subset
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'APP agent en', 'ASCII art classification en', 'key information extraction cn', 'key information extraction en',
+     'key information mapping en', 'VQA with position en', 'chart parsing en', 'cognition VQA cn', 'cognition VQA en',
+     'diagram QA en', 'document classification en', 'document parsing cn', 'document parsing en',
+     'formula recognition cn', 'formula recognition en', 'handwritten answer extraction cn', 'math QA en',
+     'full-page OCR cn', 'full-page OCR en', 'reasoning VQA en', 'reasoning VQA cn', 'fine-grained text recognition en',
+     'science QA en', 'table parsing cn', 'table parsing en', 'text counting en', 'text grounding en',
+     'text recognition en', 'text spotting en', 'text translation cn'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ocr_bench_v2',
+         pretty_name='OCRBench-v2',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'OCRBench v2 is a large-scale bilingual text-centric benchmark with currently the most comprehensive set of tasks (4x more tasks than the previous multi-scene benchmark OCRBench), the widest coverage of scenarios (31 diverse scenarios including street scene, receipt, formula, diagram, and so on), and thorough evaluation metrics, with a total of 10,000 human-verified question-answering pairs and a high proportion of difficult samples.',  # noqa: E501
+         dataset_id='evalscope/OCRBench_v2',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template='{question}',
+     )
+ )
+ class OCRBenchV2Adapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_aggregation_name = False
+         self.reformat_subset = True
+
+         check_import(
+             module_name=['apted', 'distance', 'editdistance', 'Levenshtein', 'lxml', 'Polygon', 'zss'],
+             package=['apted', 'distance', 'editdistance', 'Levenshtein', 'lxml', 'Polygon3', 'zss'],
+             raise_error=True,
+             feature_name='OCRBench-v2 benchmark'
+         )
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         input_text = self.prompt_template.format(question=record['question'])
+         content_list: List[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=json.dumps(record.get('answers'), ensure_ascii=False),  # answers is a list
+             subset_key=record.get('type'),
+             metadata={
+                 'question': record.get('question'),
+                 'answers': record.get('answers'),
+                 'eval': record.get('eval'),
+                 'dataset_name': record.get('dataset_name'),
+                 'type': record.get('type'),
+                 'bbox': record.get('bbox'),
+                 'bbox_list': record.get('bbox_list'),
+                 'content': record.get('content'),
+             }
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         from .utils import ocrbench_v2_process_results
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         doc = task_state.metadata
+         pred = filtered_prediction
+
+         score_value = ocrbench_v2_process_results(doc, pred)
+
+         score.value = {'acc': score_value}
+         return score
+
+     def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+         """
+         Finalize the report generation process. Calculate the overall score.
+         """
+         # Lazy import to avoid changing top-level imports
+         from evalscope.report import Category, weighted_average_from_subsets
+
+         for metric in report.metrics:
+             # Collect all subsets in a dictionary for easy access
+             subset_dict: Dict[str, Subset] = {}
+             for category in metric.categories:
+                 for subset in category.subsets:
+                     subset_dict[subset.name] = subset
+
+             # Define category groupings (per utils.ocrbench_v2_aggregate_accuracy)
+             en_categories = {
+                 'text_recognition_en': ['text recognition en', 'fine-grained text recognition en', 'full-page OCR en'],
+                 'text_detection_en': ['text grounding en', 'VQA with position en'],
+                 'text_spotting_en': ['text spotting en'],
+                 'relationship_extraction_en': ['key information extraction en', 'key information mapping en'],
+                 'element_parsing_en':
+                 ['document parsing en', 'chart parsing en', 'table parsing en', 'formula recognition en'],
+                 'mathematical_calculation_en': ['math QA en', 'text counting en'],
+                 'visual_text_understanding_en': ['document classification en', 'cognition VQA en', 'diagram QA en'],
+                 'knowledge_reasoning_en':
+                 ['reasoning VQA en', 'science QA en', 'APP agent en', 'ASCII art classification en'],
+             }
+             cn_categories = {
+                 'text_recognition_cn': ['full-page OCR cn'],
+                 'relationship_extraction_cn': ['key information extraction cn', 'handwritten answer extraction cn'],
+                 'element_parsing_cn': ['document parsing cn', 'table parsing cn', 'formula recognition cn'],
+                 'visual_text_understanding_cn': ['cognition VQA cn'],
+                 'knowledge_reasoning_cn': ['reasoning VQA cn', 'text translation cn'],
+             }
+
+             # Compute per-category scores (unweighted average of member subsets)
+             for cat_name, sub_names in en_categories.items():
+                 subset_dict[cat_name] = weighted_average_from_subsets(sub_names, subset_dict)
+             for cat_name, sub_names in cn_categories.items():
+                 subset_dict[cat_name] = weighted_average_from_subsets(sub_names, subset_dict)
+
+             # Compute EN (average of EN category scores) and CN (average of CN category scores)
+             en_cat_names = list(en_categories.keys())
+             cn_cat_names = list(cn_categories.keys())
+             subset_dict['EN'] = weighted_average_from_subsets(en_cat_names, subset_dict)
+             subset_dict['CN'] = weighted_average_from_subsets(cn_cat_names, subset_dict)
+
+             # Compute OVERALL (average of EN and CN)
+             subset_dict['OVERALL'] = weighted_average_from_subsets(['EN', 'CN'], subset_dict)
+
+             # Prepare and append a dummy category to show all computed aggregates
+             all_computed = en_cat_names + cn_cat_names + ['EN', 'CN', 'OVERALL']
+             dummy_subsets = []
+             for name in all_computed:
+                 if name in subset_dict:
+                     s = subset_dict[name]
+                     if s.num > 0:
+                         s.name = name  # Ensure the name is set correctly
+                         dummy_subsets.append(s)
+
+             if dummy_subsets:
+                 metric.categories.append(Category(name='-', subsets=dummy_subsets))
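Note on the aggregation helper used in _on_generate_report_end: weighted_average_from_subsets comes from evalscope.report and its body is not rendered in this diff. The self-contained sketch below illustrates one plausible reading of what such a helper does (average of member subset scores, weighted by sample count, over the score/num fields referenced above). It is an assumption for clarity, not the library's implementation; note that the adapter's own comment describes the per-category score as an unweighted average.

# Illustrative stand-in only (not the evalscope.report implementation).
# Subset fields (name, score, num) are inferred from the adapter code above.
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Subset:
    name: str = ''
    score: float = 0.0
    num: int = 0


def weighted_average_from_subsets(names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
    """Average the scores of the named subsets, weighted by their sample counts."""
    members = [subset_dict[n] for n in names if n in subset_dict and subset_dict[n].num > 0]
    total = sum(s.num for s in members)
    if total == 0:
        return Subset()  # empty aggregate; the caller above skips subsets with num == 0
    return Subset(score=sum(s.score * s.num for s in members) / total, num=total)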
evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py
@@ -0,0 +1,50 @@
+ import jieba
+ import nltk
+ import re
+ from nltk.metrics import f_measure, precision, recall
+ from nltk.translate import meteor_score
+
+ from evalscope.utils.function_utils import thread_safe
+
+
+ def contain_chinese_string(text):
+     chinese_pattern = re.compile(r'[\u4e00-\u9fa5]')
+     return bool(chinese_pattern.search(text))
+
+
+ @thread_safe
+ def cal_per_metrics(pred, gt):
+     metrics = {}
+
+     if contain_chinese_string(gt) or contain_chinese_string(pred):
+         reference = jieba.lcut(gt)
+         hypothesis = jieba.lcut(pred)
+     else:
+         reference = gt.split()
+         hypothesis = pred.split()
+
+     metrics['bleu'] = nltk.translate.bleu([reference], hypothesis)
+     metrics['meteor'] = meteor_score.meteor_score([reference], hypothesis)
+
+     reference = set(reference)
+     hypothesis = set(hypothesis)
+     metrics['f_measure'] = f_measure(reference, hypothesis)
+
+     metrics['precision'] = precision(reference, hypothesis)
+     metrics['recall'] = recall(reference, hypothesis)
+     metrics['edit_dist'] = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
+     return metrics
+
+
+ if __name__ == '__main__':
+     # Examples for region text recognition and read all text tasks
+     predict_text = "metrics['edit_dist'] = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))"
+     true_text = 'metrics = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))'
+
+     scores = cal_per_metrics(predict_text, true_text)
+
+     predict_text = "metrics['edit_dist'] len(gt))"
+     true_text = 'metrics = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))'
+
+     scores = cal_per_metrics(predict_text, true_text)
+     print(scores)
evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py
@@ -0,0 +1,46 @@
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+ from tqdm import tqdm
+
+
+ def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0):
+     """
+     A parallel version of the map function with a progress bar.
+
+     Args:
+         array (array-like): An array to iterate over.
+         function (function): A python function to apply to the elements of array
+         n_jobs (int, default=16): The number of cores to use
+         use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of
+             keyword arguments to function
+         front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job.
+             Useful for catching bugs
+     Returns:
+         [function(array[0]), function(array[1]), ...]
+     """
+     # We run the first few iterations serially to catch bugs
+     if front_num > 0:
+         front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
+     else:
+         front = []
+     # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
+     if n_jobs == 1:
+         return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])]
+     # Assemble the workers
+     with ProcessPoolExecutor(max_workers=n_jobs) as pool:
+         # Pass the elements of array into function
+         if use_kwargs:
+             futures = [pool.submit(function, **a) for a in array[front_num:]]
+         else:
+             futures = [pool.submit(function, a) for a in array[front_num:]]
+         kwargs = {'total': len(futures), 'unit': 'it', 'unit_scale': True, 'leave': True}
+         # Print out the progress as tasks complete
+         for f in tqdm(as_completed(futures), **kwargs):
+             pass
+     out = []
+     # Get the results from the futures.
+     for i, future in tqdm(enumerate(futures)):
+         try:
+             out.append(future.result())
+         except Exception as e:
+             out.append(e)
+     return front + out
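For reference, a minimal usage sketch of the parallel_process helper above. The square function, job counts, and the __main__ guard are illustrative; the import path follows the file location listed earlier (evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py).

from evalscope.benchmarks.ocr_bench.ocr_bench_v2.parallel import parallel_process


def square(x):
    # Any picklable, module-level function works with ProcessPoolExecutor.
    return x * x


if __name__ == '__main__':
    # Run the first two items serially to surface bugs, then fan out across 4 worker processes.
    results = parallel_process(list(range(100)), square, n_jobs=4, front_num=2)
    print(results[:5])  # [0, 1, 4, 9, 16]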
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt
@@ -0,0 +1,26 @@
+ INSTRUCTIONS FOR THE STANDALONE SCRIPTS
+ Requirements:
+ - Python version 3.
+ - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions.
+
+ Procedure:
+ Download the ZIP file for the requested script and unzip it to a directory.
+
+ Open a terminal in the directory and run the command:
+ python script.py -g=gt.zip -s=submit.zip
+
+ If you have already installed all the required modules, you will see the method's results or an error message if the submitted file is not correct.
+
+ If a module is not present, install it with pip: pip install 'module'
+
+ For the Polygon module, use: pip install Polygon3
+
+ Parameters:
+ -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same ZIP file, named 'gt.zip', 'gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task.
+ -s: Path of your method's results file.
+
+ Optional parameters:
+ -o: Path to a directory where to copy the file 'results.zip' that contains per-sample results.
+ -p: JSON string parameters to override the script's default parameters. The parameters that can be overridden are defined in the function 'default_evaluation_params' located at the beginning of the evaluation script.
+
+ Example: python script.py -g=gt.zip -s=submit.zip -o=./ -p={\"IOU_CONSTRAINT\":0.8}