evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py (new file)
@@ -0,0 +1,481 @@
+ # flake8: noqa
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # encoding=utf8
+ # File: E2E_iou_1_1.py
+ # Version: 1.1
+ # Version info: changes for Python 3
+ # Date: 2019-12-29
+ # Description: Evaluation script that computes End to End Recognition. For Text Localization the Intersection over Union criterion is used.
+ # Average Precision is also calculated when the 'CONFIDENCES' parameter is True
+ # There are 2 modes to determine if a detection is correct or not:
+ # with Word Spotting: The detected word must coincide (ignoring case) with a filtered Ground Truth containing only dictionary words (see include_in_dictionary and include_in_dictionary_transcription functions)
+ # without Word Spotting: words must be equal excluding a set of special characters
+
+ import importlib
+ from collections import namedtuple
+
+ from . import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+
+
+ def evaluation_imports():
+     """
+     evaluation_imports: Dictionary ( key = module name , value = alias ) with python modules used in the evaluation.
+     """
+     return {'Polygon': 'plg', 'numpy': 'np'}
+
+
+ def default_evaluation_params():
+     """
+     default_evaluation_params: Default parameters to use for the validation and evaluation.
+     """
+     return {
+         'IOU_CONSTRAINT': 0.5,
+         'AREA_PRECISION_CONSTRAINT': 0.5,
+         'WORD_SPOTTING': False,
+         'MIN_LENGTH_CARE_WORD': 3,
+         'GT_SAMPLE_NAME_2_ID': 'gt_img_([0-9]+).txt',
+         'DET_SAMPLE_NAME_2_ID': 'res_img_([0-9]+).txt',
+         'LTRB': False,  # LTRB:2points(left,top,right,bottom) or 4 points(x1,y1,x2,y2,x3,y3,x4,y4)
+         'CRLF': False,  # Lines are delimited by Windows CRLF format
+         'CONFIDENCES': False,  # Detections must include confidence value. AP will be calculated
+         'SPECIAL_CHARACTERS': "!?.:,*\"()·[]/'",
+         'ONLY_REMOVE_FIRST_LAST_CHARACTER': True,
+     }
+
+
+ def validate_data(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
+     Also validates that there are no missing files in the folder.
+     If an error is detected, the method raises it.
+     """
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     # Validate format of GroundTruth
+     for k in gt:
+         rrc_evaluation_funcs.validate_lines_in_file(k, gt[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True)
+
+     # Validate format of results
+     for k in subm:
+         if (k in gt) == False:
+             raise Exception('The sample %s not present in GT' % k)
+
+         rrc_evaluation_funcs.validate_lines_in_file(
+             k, subm[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+         )
+
+
+ def evaluate_method(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method evaluate_method: evaluate method and returns the results
+     Results. Dictionary with the following values:
+     - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
+     - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
+     """
+     for module, alias in evaluation_imports().items():
+         globals()[alias] = importlib.import_module(module)
+
+     def polygon_from_points(points, correctOffset=False):
+         """
+         Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4
+         """
+
+         if correctOffset:  # this will subtract 1 from the coordinates that correspond to the xmax and ymax
+             points[2] -= 1
+             points[4] -= 1
+             points[5] -= 1
+             points[7] -= 1
+
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(points[0])
+         resBoxes[0, 4] = int(points[1])
+         resBoxes[0, 1] = int(points[2])
+         resBoxes[0, 5] = int(points[3])
+         resBoxes[0, 2] = int(points[4])
+         resBoxes[0, 6] = int(points[5])
+         resBoxes[0, 3] = int(points[6])
+         resBoxes[0, 7] = int(points[7])
+         pointMat = resBoxes[0].reshape([2, 4]).T
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_polygon(rect):
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(rect.xmin)
+         resBoxes[0, 4] = int(rect.ymax)
+         resBoxes[0, 1] = int(rect.xmin)
+         resBoxes[0, 5] = int(rect.ymin)
+         resBoxes[0, 2] = int(rect.xmax)
+         resBoxes[0, 6] = int(rect.ymin)
+         resBoxes[0, 3] = int(rect.xmax)
+         resBoxes[0, 7] = int(rect.ymax)
+
+         pointMat = resBoxes[0].reshape([2, 4]).T
+
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_points(rect):
+         points = [
+             int(rect.xmin),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymin),
+             int(rect.xmin),
+             int(rect.ymin)
+         ]
+         return points
+
+     def get_union(pD, pG):
+         areaA = pD.area()
+         areaB = pG.area()
+         return areaA + areaB - get_intersection(pD, pG)
+
+     def get_intersection_over_union(pD, pG):
+         try:
+             return get_intersection(pD, pG) / get_union(pD, pG)
+         except:
+             return 0
+
+     def get_intersection(pD, pG):
+         pInt = pD & pG
+         if len(pInt) == 0:
+             return 0
+         return pInt.area()
+
+     def compute_ap(confList, matchList, numGtCare):
+         correct = 0
+         AP = 0
+         if len(confList) > 0:
+             confList = np.array(confList)
+             matchList = np.array(matchList)
+             sorted_ind = np.argsort(-confList)
+             confList = confList[sorted_ind]
+             matchList = matchList[sorted_ind]
+             for n in range(len(confList)):
+                 match = matchList[n]
+                 if match:
+                     correct += 1
+                     AP += float(correct) / (n + 1)
+
+             if numGtCare > 0:
+                 AP /= numGtCare
+
+         return AP
+
+     def transcription_match(
+         transGt, transDet, specialCharacters="!?.:,*\"()·[]/'", onlyRemoveFirstLastCharacterGT=True
+     ):
+         if onlyRemoveFirstLastCharacterGT:
+             # special characters in GT are allowed only at initial or final position
+             if transGt == transDet:
+                 return True
+
+             if specialCharacters.find(transGt[0]) > -1:
+                 if transGt[1:] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[0:len(transGt) - 1] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[0]) > -1 and specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[1:len(transGt) - 1] == transDet:
+                     return True
+             return False
+         else:
+             # Special characters are removed from the beginning and the end of both Detection and GroundTruth
+             while len(transGt) > 0 and specialCharacters.find(transGt[0]) > -1:
+                 transGt = transGt[1:]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[0]) > -1:
+                 transDet = transDet[1:]
+
+             while len(transGt) > 0 and specialCharacters.find(transGt[-1]) > -1:
+                 transGt = transGt[0:len(transGt) - 1]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[-1]) > -1:
+                 transDet = transDet[0:len(transDet) - 1]
+
+             return transGt == transDet
+
+     def include_in_dictionary(transcription):
+         """
+         Function used in Word Spotting that finds if the Ground Truth transcription meets the rules to enter into the dictionary. If not, the transcription will be treated as don't care
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         if len(transcription) != len(transcription.replace(' ', '')):
+             return False
+
+         if len(transcription) < evaluationParams['MIN_LENGTH_CARE_WORD']:
+             return False
+
+         notAllowed = '×÷·'
+
+         range1 = [ord('a'), ord('z')]
+         range2 = [ord('A'), ord('Z')]
+         range3 = [ord('À'), ord('ƿ')]
+         range4 = [ord('DŽ'), ord('ɿ')]
+         range5 = [ord('Ά'), ord('Ͽ')]
+         range6 = [ord('-'), ord('-')]
+
+         for char in transcription:
+             charCode = ord(char)
+             if notAllowed.find(char) != -1:
+                 return False
+
+             valid = ((charCode >= range1[0] and charCode <= range1[1])
+                      or (charCode >= range2[0] and charCode <= range2[1])
+                      or (charCode >= range3[0] and charCode <= range3[1])
+                      or (charCode >= range4[0] and charCode <= range4[1])
+                      or (charCode >= range5[0] and charCode <= range5[1])
+                      or (charCode >= range6[0] and charCode <= range6[1]))
+             if valid == False:
+                 return False
+
+         return True
+
+     def include_in_dictionary_transcription(transcription):
+         """
+         Function applied to the Ground Truth transcriptions used in Word Spotting. It removes special characters or terminations
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         return transcription
+
+     perSampleMetrics = {}
+
+     matchedSum = 0
+
+     Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax')
+
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     numGlobalCareGt = 0
+     numGlobalCareDet = 0
+
+     arrGlobalConfidences = []
+     arrGlobalMatches = []
+
+     for resFile in gt:
+         gtFile = rrc_evaluation_funcs.decode_utf8(gt[resFile])
+         if gtFile is None:
+             raise Exception('The file %s is not UTF-8' % resFile)
+
+         recall = 0
+         precision = 0
+         hmean = 0
+         detCorrect = 0
+         iouMat = np.empty([1, 1])
+         gtPols = []
+         detPols = []
+         gtTrans = []
+         detTrans = []
+         gtPolPoints = []
+         detPolPoints = []
+         gtDontCarePolsNum = []  # Array of Ground Truth Polygons' keys marked as don't Care
+         detDontCarePolsNum = []  # Array of Detected Polygons matched with a don't Care GT
+         detMatchedNums = []
+         pairs = []
+
+         arrSampleConfidences = []
+         arrSampleMatch = []
+         sampleAP = 0
+
+         evaluationLog = ''
+
+         pointsList, _, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+             gtFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, False
+         )
+         for n in range(len(pointsList)):
+             points = pointsList[n]
+             transcription = transcriptionsList[n]
+             dontCare = transcription == '###'
+             if evaluationParams['LTRB']:
+                 gtRect = Rectangle(*points)
+                 gtPol = rectangle_to_polygon(gtRect)
+             else:
+                 gtPol = polygon_from_points(points)
+             gtPols.append(gtPol)
+             gtPolPoints.append(points)
+
+             # On word spotting we will filter some transcriptions with special characters
+             if evaluationParams['WORD_SPOTTING']:
+                 if dontCare == False:
+                     if include_in_dictionary(transcription) == False:
+                         dontCare = True
+                     else:
+                         transcription = include_in_dictionary_transcription(transcription)
+
+             gtTrans.append(transcription)
+             if dontCare:
+                 gtDontCarePolsNum.append(len(gtPols) - 1)
+
+         evaluationLog += 'GT polygons: ' + str(
+             len(gtPols)
+         ) + (' (' + str(len(gtDontCarePolsNum)) + " don't care)\n" if len(gtDontCarePolsNum) > 0 else '\n')
+
+         if resFile in subm:
+             detFile = rrc_evaluation_funcs.decode_utf8(subm[resFile])
+
+             pointsList, confidencesList, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+                 detFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+             )
+
+             for n in range(len(pointsList)):
+                 points = pointsList[n]
+                 transcription = transcriptionsList[n]
+
+                 if evaluationParams['LTRB']:
+                     detRect = Rectangle(*points)
+                     detPol = rectangle_to_polygon(detRect)
+                 else:
+                     detPol = polygon_from_points(points)
+                 detPols.append(detPol)
+                 detPolPoints.append(points)
+                 detTrans.append(transcription)
+
+                 if len(gtDontCarePolsNum) > 0:
+                     for dontCarePol in gtDontCarePolsNum:
+                         dontCarePol = gtPols[dontCarePol]
+                         intersected_area = get_intersection(dontCarePol, detPol)
+                         pdDimensions = detPol.area()
+                         precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions
+                         if precision > evaluationParams['AREA_PRECISION_CONSTRAINT']:
+                             detDontCarePolsNum.append(len(detPols) - 1)
+                             break
+
+             evaluationLog += 'DET polygons: ' + str(
+                 len(detPols)
+             ) + (' (' + str(len(detDontCarePolsNum)) + " don't care)\n" if len(detDontCarePolsNum) > 0 else '\n')
+
+             if len(gtPols) > 0 and len(detPols) > 0:
+                 # Calculate IoU and precision matrices
+                 outputShape = [len(gtPols), len(detPols)]
+                 iouMat = np.empty(outputShape)
+                 gtRectMat = np.zeros(len(gtPols), np.int8)
+                 detRectMat = np.zeros(len(detPols), np.int8)
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         pG = gtPols[gtNum]
+                         pD = detPols[detNum]
+                         iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
+
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         if gtRectMat[gtNum] == 0 and detRectMat[
+                                 detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum:
+                             if iouMat[gtNum, detNum] > evaluationParams['IOU_CONSTRAINT']:
+                                 gtRectMat[gtNum] = 1
+                                 detRectMat[detNum] = 1
+                                 # detection matched only if transcription is equal
+                                 if evaluationParams['WORD_SPOTTING']:
+                                     correct = gtTrans[gtNum].upper() == detTrans[detNum].upper()
+                                 else:
+                                     correct = transcription_match(
+                                         gtTrans[gtNum].upper(), detTrans[detNum].upper(),
+                                         evaluationParams['SPECIAL_CHARACTERS'],
+                                         evaluationParams['ONLY_REMOVE_FIRST_LAST_CHARACTER']
+                                     ) == True
+                                 detCorrect += 1 if correct else 0
+                                 if correct:
+                                     detMatchedNums.append(detNum)
+                                 pairs.append({'gt': gtNum, 'det': detNum, 'correct': correct})
+                                 evaluationLog += 'Match GT #' + str(gtNum) + ' with Det #' + str(
+                                     detNum
+                                 ) + ' trans. correct: ' + str(correct) + '\n'
+
+             if evaluationParams['CONFIDENCES']:
+                 for detNum in range(len(detPols)):
+                     if detNum not in detDontCarePolsNum:
+                         # we exclude the don't care detections
+                         match = detNum in detMatchedNums
+
+                         arrSampleConfidences.append(confidencesList[detNum])
+                         arrSampleMatch.append(match)
+
+                         arrGlobalConfidences.append(confidencesList[detNum])
+                         arrGlobalMatches.append(match)
+
+         numGtCare = len(gtPols) - len(gtDontCarePolsNum)
+         numDetCare = len(detPols) - len(detDontCarePolsNum)
+         if numGtCare == 0:
+             recall = float(1)
+             precision = float(0) if numDetCare > 0 else float(1)
+             sampleAP = precision
+         else:
+             recall = float(detCorrect) / numGtCare
+             precision = 0 if numDetCare == 0 else float(detCorrect) / numDetCare
+             if evaluationParams['CONFIDENCES']:
+                 sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, numGtCare)
+
+         hmean = 0 if (precision + recall) == 0 else 2.0 * precision * recall / (precision + recall)
+
+         matchedSum += detCorrect
+         numGlobalCareGt += numGtCare
+         numGlobalCareDet += numDetCare
+
+         perSampleMetrics[resFile] = {
+             'precision': precision,
+             'recall': recall,
+             'hmean': hmean,
+             'pairs': pairs,
+             'AP': sampleAP,
+             'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
+             'gtPolPoints': gtPolPoints,
+             'detPolPoints': detPolPoints,
+             'gtTrans': gtTrans,
+             'detTrans': detTrans,
+             'gtDontCare': gtDontCarePolsNum,
+             'detDontCare': detDontCarePolsNum,
+             'evaluationParams': evaluationParams,
+             'evaluationLog': evaluationLog,
+         }
+
+     # Compute AP
+     AP = 0
+     if evaluationParams['CONFIDENCES']:
+         AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt)
+
+     methodRecall = 0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
+     methodPrecision = 0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
+     methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / (
+         methodRecall + methodPrecision
+     )
+
+     methodMetrics = {'precision': methodPrecision, 'recall': methodRecall, 'hmean': methodHmean, 'AP': AP}
+
+     resDict = {'calculated': True, 'Message': '', 'method': methodMetrics, 'per_sample': perSampleMetrics}
+
+     return resDict
+
+
+ if __name__ == '__main__':
+     rrc_evaluation_funcs.main_evaluation(None, default_evaluation_params, validate_data, evaluate_method)
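The matching rule implemented above pairs a ground-truth region with a detection only when their intersection over union exceeds IOU_CONSTRAINT (0.5 by default); only then are the transcriptions compared. As a minimal sketch of that IoU criterion, here is a plain-Python version for axis-aligned boxes with made-up coordinates (the script itself uses the Polygon package to handle arbitrary quadrilaterals):

def box_iou(a, b):
    """IoU of two axis-aligned boxes given as (xmin, ymin, xmax, ymax)."""
    iw = max(0, min(a[2], b[2]) - max(a[0], b[0]))  # intersection width
    ih = max(0, min(a[3], b[3]) - max(a[1], b[1]))  # intersection height
    inter = iw * ih
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union else 0.0

gt_box = (0, 0, 100, 100)
det_box = (10, 0, 110, 100)     # detection shifted 10 px to the right
iou = box_iou(gt_box, det_box)  # 9000 / 11000 ≈ 0.818
print(iou > 0.5)                # True -> the transcriptions would then be compared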
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py (new file)
@@ -0,0 +1,179 @@
+ import ast
+ import os
+ import re
+ import shutil
+ import zipfile
+
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR
+ from evalscope.utils.function_utils import thread_safe
+ from .spotting_eval import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+ from .spotting_eval.script import default_evaluation_params, evaluate_method, validate_data
+
+
+ def extract_bounding_boxes_robust(predict_str):
+     """
+     Extract coordinates and text content from the given prediction string,
+     handling potential format issues.
+
+     Args:
+         predict_str (str): Model prediction output as a string.
+
+     Returns:
+         list: Extracted data in the format [[x1, y1, x2, y2, text_content], ...].
+             Returns None if no valid data is extracted.
+     """
+     results = []
+     seen = set()
+
+     # try parsing with ast.literal_eval
+     try:
+         data = ast.literal_eval(predict_str)
+     except Exception:
+         data = None
+
+     if data is not None:
+         if isinstance(data, (list, tuple)):
+             for item in data:
+                 if isinstance(item, (list, tuple)) and len(item) >= 5:
+                     x1_str, y1_str, x2_str, y2_str = item[:4]
+                     text_content = item[4]
+
+                     x1_str = str(x1_str).strip()
+                     y1_str = str(y1_str).strip()
+                     x2_str = str(x2_str).strip()
+                     y2_str = str(y2_str).strip()
+                     text_content = str(text_content).replace('\n', '').strip().strip('"').strip("'")
+
+                     try:
+                         x1 = int(x1_str)
+                         y1 = int(y1_str)
+                         x2 = int(x2_str)
+                         y2 = int(y2_str)
+
+                         if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                             continue
+
+                         key = (x1, y1, x2, y2, text_content)
+                         if key in seen:
+                             continue
+
+                         seen.add(key)
+                         results.append([x1, y1, x2, y2, text_content])
+                     except ValueError:
+                         continue
+     else:
+         # try parsing with regular expression
+
+         list_content = predict_str
+         items = re.findall(r'[\[\(]\s*([^\[\]\(\)]*?)\s*[\]\)]', list_content)
+
+         if not items:
+             return None
+
+         for item in items:
+             parts = item.split(',', 4)
+             if len(parts) < 5:
+                 continue
+
+             x1_str, y1_str, x2_str, y2_str, text_content = parts
+
+             x1_str = x1_str.strip()
+             y1_str = y1_str.strip()
+             x2_str = x2_str.strip()
+             y2_str = y2_str.strip()
+             text_content = text_content.replace('\n', '').strip().strip('"').strip("'")
+
+             try:
+                 x1 = int(x1_str)
+                 y1 = int(y1_str)
+                 x2 = int(x2_str)
+                 y2 = int(y2_str)
+
+                 if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                     continue
+
+                 key = (x1, y1, x2, y2, text_content)
+                 if key in seen:
+                     continue
+
+                 seen.add(key)
+                 results.append([x1, y1, x2, y2, text_content])
+             except ValueError:
+                 continue
+
+     if not results:
+         return None
+
+     return results
+
+
+ def zip_folder(source_folder, destination_zip):
+     abs_source = os.path.abspath(source_folder)
+     abs_destination = os.path.abspath(destination_zip)
+
+     with zipfile.ZipFile(abs_destination, 'w', zipfile.ZIP_DEFLATED) as zf:
+         for root, _, files in os.walk(abs_source):
+             for file in files:
+                 abs_file_path = os.path.join(root, file)
+
+                 relative_path = os.path.relpath(abs_file_path, abs_source)
+                 zf.write(abs_file_path, relative_path)
+
+
+ @thread_safe
+ def spotting_evaluation(prediction_list, img_metas):
+     score = 0
+
+     submit_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit')
+     gt_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt')
+     submit_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit.zip')
+     gt_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt.zip')
+     for file_path in [submit_path, gt_path, submit_zip_path, gt_zip_path]:
+         if 'zip' in file_path:
+             if os.path.exists(file_path):
+                 os.remove(file_path)
+         else:
+             if os.path.exists(file_path):
+                 shutil.rmtree(file_path)
+             os.makedirs(file_path, exist_ok=True)
+
+     res_submit_list = []
+     for item in prediction_list:
+         x1, y1, x2, y2, rec = item
+         if x1 >= x2 or y1 >= y2:
+             continue
+
+         res_submit_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     res_gt_list = []
+     for bbox, rec in zip(img_metas['bbox_list'], img_metas['content']):
+         x_coords = bbox[0::2]
+         y_coords = bbox[1::2]
+
+         x1, y1 = min(x_coords), min(y_coords)
+         x2, y2 = max(x_coords), max(y_coords)
+
+         res_gt_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     if len(res_submit_list) == 0 or len(res_gt_list) == 0:
+         return 0
+
+     with open(os.path.join(submit_path, 'res_img_0.txt'), 'w') as f:
+         for item in res_submit_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_submit_list[-1])
+
+     with open(os.path.join(gt_path, 'gt_img_0.txt'), 'w') as f:
+         for item in res_gt_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_gt_list[-1])
+
+     zip_folder(submit_path, submit_zip_path)
+     zip_folder(gt_path, gt_zip_path)
+
+     command = {'g': gt_zip_path, 's': submit_zip_path, 'o': DEFAULT_EVALSCOPE_CACHE_DIR, 'p': '{"IOU_CONSTRAINT":0.5}'}
+
+     # run rrc_evaluation_funcs
+     result = rrc_evaluation_funcs.main_evaluation(command, default_evaluation_params, validate_data, evaluate_method)
+     score = result['method']['hmean']
+     return score
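To illustrate the two parsing paths in extract_bounding_boxes_robust, here is a short usage sketch (the import path follows the file list above; the prediction strings are invented for illustration):

from evalscope.benchmarks.ocr_bench.ocr_bench_v2.spotting_metric import extract_bounding_boxes_robust

# Well-formed Python literal: parsed via ast.literal_eval.
extract_bounding_boxes_robust("[[100, 200, 300, 400, 'hello']]")
# -> [[100, 200, 300, 400, 'hello']]

# Loose free-text output: handled by the regex fallback.
extract_bounding_boxes_robust("boxes: (10, 20, 30, 40, world), trailing text")
# -> [[10, 20, 30, 40, 'world']]

# Coordinates outside the 0..1000 range are rejected; with nothing valid left, the function returns None.
extract_bounding_boxes_robust("[[0, 0, 2000, 50, 'oops']]")
# -> None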