evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,349 @@
1
+ # flake8: noqa
2
+ import numpy as np
3
+ import os
4
+ import sys
5
+ from collections import defaultdict
6
+ from typing import Dict, List
7
+
8
+ from evalscope.utils import get_logger
9
+
10
+ logger = get_logger()
11
+
12
+
13
class End2EndEvaluator():
    """Match ground-truth page annotations to predicted page content and score them.

    ``score()`` runs two stages: ``get_matched_elements`` pairs every reference
    page with the prediction at the same index and groups the matches per
    element type, then ``process_generated_metric_results`` runs the configured
    metrics and flattens the per-language page results into one dict.
    """

    # Ground-truth categories that take part in plain-text matching.
    _TEXT_CATEGORIES = (
        'text_block', 'title', 'code_txt', 'code_txt_caption', 'reference', 'equation_caption', 'figure_caption',
        'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm', 'code_algorithm_caption', 'header',
        'footer', 'page_footnote', 'page_number'
    )

    # Categories that are matched (so they can absorb predicted text) but are
    # dropped from the text / reading-order results afterwards.
    _IGNORED_TEXT_CATEGORIES = (
        'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
        'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number', 'equation_caption'
    )

    # (element type, metric) pairs reported in the final summary dict.
    _SUMMARY_PAIRS = (
        ('text_block', 'Edit_dist'),
        ('display_formula', 'Edit_dist'),
        ('display_formula', 'CDM'),
        ('table', 'TEDS'),
        ('table', 'Edit_dist'),
        ('reading_order', 'Edit_dist'),
    )

    def __init__(
        self,
        prediction: List,
        reference: List,
        metrics: Dict,
        match_method: str = 'quick_match',
        filter_types: dict = None
    ):
        """Store the inputs and optionally keep only pages with given attributes.

        Args:
            prediction: Predicted page contents; ``prediction[i]`` is scored
                against ``reference[i]``.
            reference: Ground-truth page samples. Each sample is read through
                ``sample['page_info']`` (``image_path``, ``page_attribute``),
                ``sample['extra']['relation']`` and ``sample['layout_dets']``.
            metrics: Mapping ``element -> {'metric': [...], 'group': [...]}``.
            match_method: ``'simple_match'``, ``'quick_match'`` or ``'no_split'``;
                any other value falls back to the quick matcher.
            filter_types: Optional ``{page_attribute_name: required_value}``.
                A page is kept only when every listed attribute equals the
                required value.
        """
        self.match_method = match_method
        # The attribute name (including its spelling) is kept for compatibility.
        self.dafault_metircs_dict = metrics

        if filter_types:
            kept = [
                idx for idx, gt_sample in enumerate(reference)
                if all(gt_sample['page_info']['page_attribute'][k] == v for k, v in filter_types.items())
            ]
            self.references = [reference[idx] for idx in kept]
            # Predictions are paired with references by index, so they must be
            # filtered in lock-step; otherwise the surviving pages would be
            # scored against the wrong predictions. Lists of different lengths
            # cannot be aligned safely and are left untouched.
            if len(prediction) == len(reference):
                self.predictions = [prediction[idx] for idx in kept]
            else:
                self.predictions = prediction
        else:
            self.references = reference
            self.predictions = prediction

    def score(self) -> dict:
        """Match all pages, evaluate the configured metrics and return the summary dict."""
        matched = self.get_matched_elements(self.references, self.predictions)
        return self.process_generated_metric_results(matched)

    def get_page_elements(self, selected_annos):
        """Group one page's layout items by ``category_type``.

        Items connected by ``'truncated'`` relations are merged into a single
        block whose text is the concatenation of the parts in ``order`` order;
        the merged block takes category, order and anno id from the first part
        and keeps the parts under ``'merge_list'``.

        Args:
            selected_annos: One ground-truth page sample.

        Returns:
            ``defaultdict(list)`` mapping category type to its items.
        """
        saved_element_dict = defaultdict(list)
        related_truncated = []  # groups of anno ids that form one logical block
        truncated_all = {}  # anno id -> layout item ('' until the item is seen)

        for relation in selected_annos['extra']['relation']:
            if relation['relation_type'] != 'truncated':
                continue
            pair = (relation['source_anno_id'], relation['target_anno_id'])
            for anno_id in pair:
                truncated_all[anno_id] = ''

            joined = False
            for merge_list in related_truncated:
                # Chains of three or more blocks: extend the existing group.
                if pair[0] in merge_list or pair[1] in merge_list:
                    for anno_id in pair:
                        # Add each id once only; a duplicated id would repeat
                        # that block's text in the merged result.
                        if anno_id not in merge_list:
                            merge_list.append(anno_id)
                    joined = True
            if not joined:
                related_truncated.append(list(dict.fromkeys(pair)))

        for item in selected_annos['layout_dets']:
            if item['anno_id'] in truncated_all:
                truncated_all[item['anno_id']] = item
            else:
                saved_element_dict[item['category_type']].append(item)

        for merge_list in related_truncated:
            sorted_block = sorted((truncated_all[key] for key in merge_list), key=lambda x: x['order'])
            first = sorted_block[0]
            merged_block = {
                'category_type': first['category_type'],  # metadata comes from the first part
                'order': first['order'],
                'anno_id': first['anno_id'],
                'text': ''.join(block['text'] for block in sorted_block),
                'merge_list': sorted_block
            }
            saved_element_dict[first['category_type']].append(merged_block)

        return saved_element_dict

    def get_page_elements_list(self, gt_page_elements, category_list):
        """Concatenate the items of the given categories, in ``category_list`` order."""
        element_list = []
        for category_type in category_list:
            items = gt_page_elements.get(category_type)
            if items:
                element_list.extend(items)
        return element_list

    def get_sorted_text_list(self, selected_annos):
        """Return the items sorted by their ``'order'`` value.

        A missing or falsy ``order`` counts as 0; the sort is stable, so items
        with equal order keep their input order.
        """
        return sorted(selected_annos, key=lambda item: item.get('order') or 0)

    def filtered_out_ignore(self, items, ignore_category_list):
        """Drop matches whose ``'gt_category_type'`` is in ``ignore_category_list``."""
        return [item for item in items if item['gt_category_type'] not in ignore_category_list]

    def get_order_paired(self, order_match_s, img_name):
        """Build the reading-order sample for one page.

        ``gt`` is the sorted list of all ground-truth positions; ``pred`` lists
        the ground-truth positions of matched items in predicted order.
        ``edit`` is their Levenshtein distance divided by the longer length.

        Returns:
            ``{'gt', 'pred', 'img_id', 'edit'}``, or ``{}`` when both sequences
            are empty.
        """
        matched = [(item['gt_position'], item['pred_position'])
                   for item in order_match_s
                   if (item['gt_position'] != [''] and item['pred_position'] != '')]
        gt_idx_all = [item['gt_position'] for item in order_match_s if item['gt_position'] != ['']]
        read_order_pred = [gt_pos for gt_pos, _ in sorted(matched, key=lambda x: x[1])]

        # Flatten the per-item position lists and drop empty placeholders.
        # NOTE(review): the truthiness filter also drops a position equal to 0;
        # confirm whether 0 is a valid position in the matcher output.
        gt = sorted(x for x in sum(gt_idx_all, []) if x)
        pred = [x for x in sum(read_order_pred, []) if x]

        if not pred and not gt:
            return {}  # nothing to compare on this page

        import Levenshtein
        edit = Levenshtein.distance(gt, pred) / max(len(pred), len(gt))
        return {'gt': gt, 'pred': pred, 'img_id': img_name, 'edit': edit}

    def formula_format(self, formula_matches, img_name):
        """Set ``item['img_id'] = f'{img_name}_{index}'`` on each match (in place) and return the list."""
        for i, item in enumerate(formula_matches):
            item['img_id'] = img_name + '_' + str(i)
        return formula_matches

    def get_matched_elements(self, references: list, predictions: list) -> dict:
        """Match every reference page with ``predictions[i]`` and group the results.

        Returns:
            Dict with keys ``'text_block'``, ``'display_formula'``, ``'table'``
            and ``'reading_order'``, each wrapped in the dataset class that
            ``.metrics`` provides. Table matches use whichever of the LaTeX /
            HTML formats produced more matches (HTML on a tie).
        """
        from .metrics import recogition_end2end_base_dataset, recogition_end2end_table_dataset

        plain_text_match = []
        display_formula_match = []
        html_table_match = []
        latex_table_match = []
        order_match = []

        for i, sample in enumerate(references):
            img_name = os.path.basename(sample['page_info']['image_path'])
            (text_s, formula_s, latex_s, html_s,
             order_single) = self.process_get_matched_elements(sample, predictions[i], img_name)

            if order_single:
                order_match.append(order_single)
            if text_s:
                plain_text_match.extend(text_s)
            if formula_s:
                display_formula_match.extend(formula_s)
            if latex_s:
                latex_table_match.extend(latex_s)
            if html_s:
                html_table_match.extend(html_s)

        if len(latex_table_match) > len(html_table_match):
            table_match, table_format = latex_table_match, 'latex'
        else:
            table_match, table_format = html_table_match, 'html'

        return {
            'text_block': recogition_end2end_base_dataset(plain_text_match),
            'display_formula': recogition_end2end_base_dataset(display_formula_match),
            'table': recogition_end2end_table_dataset(table_match, table_format),
            'reading_order': recogition_end2end_base_dataset(order_match)
        }

    def process_get_matched_elements(self, sample, pred_content, img_name):
        """Match one page's ground truth against its predicted content.

        Args:
            sample: Ground-truth page sample.
            pred_content: Predicted content for the page; it is passed through
                ``md_tex_filter`` before matching.
            img_name: Page identifier used in match records and log messages.

        Returns:
            ``[plain_text_matches, display_formula_matches, latex_table_matches,
            html_table_matches, reading_order_sample]``.
        """
        from func_timeout import FunctionTimedOut, func_timeout

        from .utils import match_gt2pred_no_split, match_gt2pred_quick, match_gt2pred_simple, md_tex_filter

        matchers = {
            'simple_match': match_gt2pred_simple,
            'quick_match': match_gt2pred_quick,
            'no_split': match_gt2pred_no_split,
        }
        # Unknown method names fall back to the quick matcher.
        match_gt2pred = matchers.get(self.match_method, match_gt2pred_quick)

        pred_dataset = md_tex_filter(pred_content)
        gt_page_elements = self.get_page_elements(sample)
        text_all = self.get_page_elements_list(gt_page_elements, self._TEXT_CATEGORIES)

        display_formula_match_s = []
        plain_text_match_clean = []
        latex_table_match_s = []
        html_table_match_s = []
        order_match_single = []

        if text_all:
            gt_text_list = self.get_sorted_text_list(text_all)
            try:
                plain_text_match_s = func_timeout(
                    30, match_gt2pred, args=(gt_text_list, pred_dataset['text_all'], 'text', img_name)
                )
            except FunctionTimedOut:
                # Fall back to the simple matcher, as the message announces.
                # Re-raising here would throw the fallback result away.
                logger.warning(f'Time out for plain text match of {img_name}, match_gt2pred_simple will be used.')
                plain_text_match_s = match_gt2pred_simple(gt_text_list, pred_dataset['text_all'], 'text', img_name)

            if not plain_text_match_s:
                logger.warning(f'No text match of {img_name}. The plain text match will be empty.')
            else:
                plain_text_match_clean = self.filtered_out_ignore(plain_text_match_s, self._IGNORED_TEXT_CATEGORIES)

        if gt_page_elements.get('equation_isolated'):
            gt_display_list = self.get_sorted_text_list(gt_page_elements['equation_isolated'])
            display_formula_match_s = match_gt2pred(
                gt_display_list, pred_dataset['equation_isolated'], 'formula', img_name
            )
            # Keep only records that carry a ground-truth index.
            display_formula_match_s = [x for x in display_formula_match_s if x['gt_idx'] != ['']]
            if not display_formula_match_s:
                logger.warning(f'No display_formula_match of {img_name}. The display_formula_match will be empty.')

        if gt_page_elements.get('table'):
            gt_table_list = self.get_sorted_text_list(gt_page_elements['table'])
            if pred_dataset['latex_table']:
                latex_table_match_s = match_gt2pred_simple(
                    gt_table_list, pred_dataset['latex_table'], 'latex_table', img_name
                )
                latex_table_match_s = [x for x in latex_table_match_s if x['gt_idx'] != ['']]
            # With no predicted HTML table the ground truth is matched against
            # an empty prediction list, so the tables still produce records.
            html_pred = pred_dataset['html_table'] if pred_dataset['html_table'] else []
            html_table_match_s = match_gt2pred_simple(gt_table_list, html_pred, 'html_table', img_name)
            html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]

        # Reading order is derived from the cleaned plain-text matches.
        if plain_text_match_clean:
            order_match_single = self.get_order_paired(plain_text_match_clean, img_name)

        return [
            plain_text_match_clean, display_formula_match_s, latex_table_match_s, html_table_match_s, order_match_single
        ]

    def process_generated_metric_results(self, samples, save_name: str = 'end2end_quick_match'):
        """Run the configured metrics and flatten the per-language page results.

        Args:
            samples: Mapping ``element -> matched dataset`` as returned by
                ``get_matched_elements``.
            save_name: Prefix of the name handed to each metric's ``evaluate``.

        Returns:
            Dict with ``'<element>_<metric>_EN'`` / ``'_CH'`` entries for every
            pair in ``_SUMMARY_PAIRS`` (``np.nan`` when unavailable), plus
            ``'overall_EN'`` / ``'overall_CH'``: the mean of the non-NaN
            ``Edit_dist`` values per language.
        """
        from .metrics import METRIC_REGISTRY, get_full_labels_results, get_page_split, show_result

        metrics_dict = self.dafault_metircs_dict
        # image file name -> page attributes, used to split results per page group
        page_info = {
            os.path.basename(page['page_info']['image_path']): page['page_info']['page_attribute']
            for page in self.references
        }

        result_all = {}
        for element, element_cfg in metrics_dict.items():
            result = {}
            group_info = element_cfg.get('group', [])
            cur_samples = samples[element]

            for metric in element_cfg['metric']:
                metric_cls = METRIC_REGISTRY.get(metric)
                # Each metric returns the updated samples, which feed the next metric.
                cur_samples, result_s = metric_cls(cur_samples).evaluate(group_info, f'{save_name}_{element}')
                if result_s:
                    result.update(result_s)

            if result:
                logger.info(f'{element}')
                show_result(result)

            result_all[element] = {
                'all': result,
                'group': get_full_labels_results(cur_samples),
                'page': get_page_split(cur_samples, page_info)
            }

        save_dict = {}
        en_overall = []
        ch_overall = []
        for category_type, metric in self._SUMMARY_PAIRS:
            page_scores = result_all.get(category_type, {}).get('page', {})
            lang_scores = page_scores[metric] if metric in page_scores else {}
            # A missing language split yields NaN for every metric, TEDS included.
            en_value = lang_scores.get('language: english', np.nan)
            ch_value = lang_scores.get('language: simplified_chinese', np.nan)

            save_dict[category_type + '_' + metric + '_EN'] = en_value
            save_dict[category_type + '_' + metric + '_CH'] = ch_value

            if metric == 'Edit_dist':
                en_overall.append(en_value)
                ch_overall.append(ch_value)

        en_overall_filtered = [x for x in en_overall if not np.isnan(x)]
        ch_overall_filtered = [x for x in ch_overall if not np.isnan(x)]
        save_dict['overall_EN'] = sum(en_overall_filtered) / len(en_overall_filtered) if en_overall_filtered else np.nan
        save_dict['overall_CH'] = sum(ch_overall_filtered) / len(ch_overall_filtered) if ch_overall_filtered else np.nan

        return save_dict