evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,7 +1,18 @@
1
+ import base64
2
+ import csv
3
+ import hashlib
4
+ import io
1
5
  import json
2
6
  import jsonlines as jsonl
3
7
  import os
8
+ import re
9
+ import string
10
+ import unicodedata
4
11
  import yaml
12
+ from datetime import datetime
13
+ from io import BytesIO
14
+ from PIL import Image
15
+ from typing import Tuple
5
16
 
6
17
  from evalscope.constants import DumpMode
7
18
  from evalscope.utils.logger import get_logger
@@ -27,7 +38,7 @@ class OutputsStructure:
27
38
  'configs_dir': None
28
39
  }
29
40
 
30
- def _get_dir(self, attr_name, dir_name):
41
+ def _get_dir(self, attr_name, dir_name) -> str:
31
42
  if self._dirs[attr_name] is None:
32
43
  dir_path = os.path.join(self.outputs_dir, dir_name)
33
44
  if self.is_make:
@@ -66,10 +77,20 @@ def jsonl_to_list(jsonl_file):
66
77
  Returns:
67
78
  list: list of lines. Each line is a dict.
68
79
  """
69
- res_list = []
70
- with jsonl.open(jsonl_file, mode='r') as reader:
71
- for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
72
- res_list.append(line)
80
+ try:
81
+ res_list = []
82
+ with jsonl.open(jsonl_file, mode='r') as reader:
83
+ for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
84
+ res_list.append(line)
85
+ except Exception:
86
+ # Fallback to reading line by line
87
+ res_list = []
88
+ with open(jsonl_file, 'r', encoding='utf-8') as f:
89
+ for line in f:
90
+ if line.strip(): # Skip empty lines
91
+ res_list.append(json.loads(line.strip()))
92
+ if not res_list:
93
+ logger.warning(f'No data found in {jsonl_file}.')
73
94
  return res_list
74
95
 
75
96
 
@@ -104,6 +125,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
104
125
  if not isinstance(data_list, list):
105
126
  data_list = [data_list]
106
127
 
128
+ # Convert non-serializable types to serializable ones
129
+ data_list = convert_normal_types(data_list)
130
+
107
131
  if dump_mode == DumpMode.OVERWRITE:
108
132
  dump_mode = 'w'
109
133
  elif dump_mode == DumpMode.APPEND:
@@ -112,8 +136,76 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
112
136
  writer.write_all(data_list)
113
137
 
114
138
 
115
- def jsonl_to_csv():
116
- pass
139
def jsonl_to_csv(jsonl_file, csv_file):
    """
    Convert jsonl file to csv file.

    The header is the union of the keys of all records, in first-seen order,
    so records with missing, extra or differently ordered keys are still
    written under the right columns (missing values become empty cells).

    Args:
        jsonl_file: jsonl file path.
        csv_file: csv file path.
    """
    # Only dict records carry columns; anything else (e.g. a null line) is skipped.
    records = [item for item in jsonl_to_list(jsonl_file) if isinstance(item, dict)]
    if not records:
        logger.warning(f'No data found in {jsonl_file}.')
        return

    # dict.fromkeys de-duplicates the column names while keeping their order.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(records)
157
+
158
+
159
def csv_to_list(csv_file) -> list:
    """
    Read csv file to list.

    The first row is used as the header; every following row becomes a dict
    keyed by the header names.

    Args:
        csv_file: csv file path.

    Returns:
        list: list of lines. Each line is a dict.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are returned unchanged.
    with open(csv_file, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))
175
+
176
+
177
def tsv_to_list(tsv_file) -> list:
    """
    Read tsv file to list.

    The first row is used as the header; every following row becomes a dict
    keyed by the header names.

    Args:
        tsv_file: tsv file path.

    Returns:
        list: list of lines. Each line is a dict.
    """
    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are returned unchanged.
    with open(tsv_file, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f, delimiter='\t'))
193
+
194
+
195
def csv_to_jsonl(csv_file, jsonl_file):
    """
    Convert csv file to jsonl file.

    Nothing is written when the csv file yields no rows; a warning is logged
    instead.

    Args:
        csv_file: csv file path.
        jsonl_file: jsonl file path.
    """
    rows = csv_to_list(csv_file)
    if rows:
        dump_jsonl_data(rows, jsonl_file, dump_mode=DumpMode.OVERWRITE)
        return

    logger.warning(f'No data found in {csv_file}.')
117
209
 
118
210
 
119
211
  def yaml_to_dict(yaml_file) -> dict:
@@ -135,7 +227,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
135
227
  Dump dict to yaml file.
136
228
  """
137
229
  with open(yaml_file, 'w') as f:
138
- yaml.dump(d, f, default_flow_style=False)
230
+ yaml.dump(d, f, default_flow_style=False, allow_unicode=True)
139
231
 
140
232
 
141
233
  def json_to_dict(json_file) -> dict:
@@ -168,3 +260,239 @@ def dict_to_json(d: dict, json_file: str):
168
260
  """
169
261
  with open(json_file, 'w') as f:
170
262
  json.dump(d, f, indent=4, ensure_ascii=False)
263
+
264
+
265
def get_latest_folder_path(work_dir):
    """
    Return the path of the most recent timestamped sub-folder of ``work_dir``.

    Only direct sub-directories named ``YYYYMMDD_HHMMSS`` are considered.
    Names that have the right shape but are not a real date/time
    (e.g. ``99999999_999999``) are ignored.

    Args:
        work_dir: directory to scan.

    Returns:
        The joined path of the latest timestamped folder, or None when there
        is no such folder.
    """
    from datetime import datetime

    # Shape of the timestamp: YYYYMMDD_HHMMSS
    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')

    parsed = {}
    for entry in os.listdir(work_dir):
        if not timestamp_pattern.match(entry):
            continue
        if not os.path.isdir(os.path.join(work_dir, entry)):
            continue
        try:
            parsed[entry] = datetime.strptime(entry, '%Y%m%d_%H%M%S')
        except ValueError:
            # Digits in the right places but not a valid date/time.
            continue

    if not parsed:
        print(f'>> No timestamped folders found in {work_dir}!')
        return None

    latest_folder = max(parsed, key=parsed.get)
    return os.path.join(work_dir, latest_folder)
289
+
290
+
291
def gen_hash(name: str, bits: int = 32):
    """Return the first ``bits`` hexadecimal characters of the MD5 digest of ``name`` (UTF-8 encoded)."""
    digest = hashlib.md5(name.encode(encoding='UTF-8'))
    hex_digest = digest.hexdigest()
    return hex_digest[:bits]
293
+
294
+
295
def get_valid_list(input_list, candidate_list):
    """
    Split input_list into the items found in candidate_list and the items not found in it.

    Args:
        input_list: The input list.
        candidate_list: The candidate list.

    Returns:
        valid_list: Items of input_list that are in candidate_list, in input order.
        invalid_list: Items of input_list that are not in candidate_list, in input order.
    """
    valid_list = []
    invalid_list = []
    for item in input_list:
        bucket = valid_list if item in candidate_list else invalid_list
        bucket.append(item)
    return valid_list, invalid_list
308
+
309
+
310
def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
    """
    Encode a PIL Image as a base64 string.

    Args:
        image (Image.Image): The PIL Image to encode.
        format (str): Format passed to ``image.save``. Default is 'JPEG'.
        add_header (bool): Whether to prepend a ``data:image/<format>;base64,`` header.
            Default is False.

    Returns:
        str: Base64 encoded string of the saved image, optionally with the header.
    """
    stream = BytesIO()
    image.save(stream, format=format)
    encoded = base64.b64encode(stream.getvalue()).decode('utf-8')
    if not add_header:
        return encoded
    return f'data:image/{format.lower()};base64,{encoded}'
328
+
329
+
330
def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
    """Encode raw bytes as a base64 string.

    Args:
        bytes_data (bytes): The bytes to encode.
        format (str): Sub-type used in the header. Default is 'png'.
        add_header (bool): Whether to prepend a ``data:<content_type>/<format>;base64,``
            header. Default is False.
        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.

    Returns:
        str: Base64 encoded string of the bytes, optionally with the header.
    """
    encoded = base64.b64encode(bytes_data).decode('utf-8')
    if not add_header:
        return encoded
    return f'data:{content_type}/{format};base64,{encoded}'
346
+
347
+
348
def base64_to_PIL(base64_str):
    """Decode a base64 string into a PIL Image.

    Args:
        base64_str (str): The base64 encoded string. Everything up to and
            including the first comma is treated as a header and dropped.

    Returns:
        Image.Image: The decoded PIL Image.
    """
    # Drop the header, if any
    _, comma, payload = base64_str.partition(',')
    if comma:
        base64_str = payload

    raw = base64.b64decode(base64_str)
    return Image.open(io.BytesIO(raw))
366
+
367
+
368
def safe_filename(s: str, max_length: int = 255) -> str:
    """
    Convert a string into a safe filename by removing or replacing unsafe characters.

    Non-ASCII characters are dropped after NFKD normalization (so accented
    letters keep their base letter). Every character other than ASCII letters,
    digits, '.', '-' and '_' becomes '_', runs of '_' are collapsed, and
    leading/trailing '.' and '_' are removed. An empty result becomes 'untitled'.

    Args:
        s (str): The input string to convert
        max_length (int): Maximum length of the resulting filename (default 255)

    Returns:
        str: A safe filename string

    Examples:
        >>> safe_filename("Hello/World?.txt")
        'Hello_World_.txt'
    """
    # normalize unicode characters, then drop whatever is still non-ASCII
    s = unicodedata.normalize('NFKD', s)
    s = s.encode('ASCII', 'ignore').decode('ASCII')

    # Keep only alphanumeric characters, dots, dashes, and underscores
    safe_chars = set(string.ascii_letters + string.digits + '.-_')
    s = ''.join(c if c in safe_chars else '_' for c in s)

    # collapse consecutive underscores, then trim leading/trailing periods and
    # underscores (this also rules out hidden-file names starting with '.')
    s = re.sub(r'_+', '_', s).strip('._')

    # handle empty string case
    if not s:
        s = 'untitled'

    # enforce length limit
    if len(s) > max_length:
        name, ext = os.path.splitext(s)
        if ext and len(ext) < max_length:
            # Preserve the extension and truncate the stem only.
            s = name[:max_length - len(ext)] + ext
        else:
            # No extension, or one too long to keep: plain truncation, so the
            # result never exceeds max_length.
            s = s[:max_length]

    return s
418
+
419
+
420
def convert_normal_types(obj):
    """Recursively turn numpy scalars/arrays, datetimes and path-like objects into plain Python values.

    Dicts, lists and tuples are rebuilt with their contents converted (a tuple
    stays a tuple); any other object is returned unchanged.
    """
    import numpy as np

    # Leaf conversions first
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()

    # Containers are rebuilt element by element
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = convert_normal_types(value)
        return converted
    if isinstance(obj, list):
        return list(map(convert_normal_types, obj))
    if isinstance(obj, tuple):
        return tuple(map(convert_normal_types, obj))

    if isinstance(obj, os.PathLike):
        return str(obj)
    return obj
444
+
445
+
446
def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000) -> Tuple[bytes, str]:
    """
    Bring image bytes under max_bytes by re-encoding as JPEG.

    Quality is lowered in steps of 10 starting from 85 while it is above 40;
    if that is not enough the image is repeatedly scaled to 90% until it fits
    or its shorter side reaches 256 pixels. Returns (processed_bytes, format_str).
    Input that is already within the limit, or that PIL cannot open, is
    returned untouched and labelled 'png'.
    """
    if not len(image_bytes) > max_bytes:
        return image_bytes, 'png'

    try:
        img = Image.open(BytesIO(image_bytes))
    except Exception as exc:
        logger.warning(f'Failed to open image bytes with PIL, sending original image; may exceed API limit: {exc}')
        return image_bytes, 'png'

    # Modes other than RGB and L are converted to RGB before JPEG encoding
    if img.mode != 'RGB' and img.mode != 'L':
        img = img.convert('RGB')

    def _as_jpeg(frame: Image.Image, level: int) -> bytes:
        sink = BytesIO()
        frame.save(sink, format='JPEG', quality=level, optimize=True, progressive=True)
        return sink.getvalue()

    # Phase 1: lower the JPEG quality step by step
    lowest_quality = 40
    quality = 85
    out = _as_jpeg(img, quality)
    while quality > lowest_quality and len(out) > max_bytes:
        quality -= 10
        out = _as_jpeg(img, quality)

    # Phase 2: shrink the image, keeping the last quality level
    smallest_side = 256
    shrink = 0.9
    while True:
        if len(out) <= max_bytes or min(img.size) <= smallest_side:
            break
        target = (
            max(smallest_side, int(img.width * shrink)),
            max(smallest_side, int(img.height * shrink)),
        )
        if target == img.size:
            break
        img = img.resize(target, Image.LANCZOS)
        out = _as_jpeg(img, quality)

    if len(out) > max_bytes:
        logger.warning(f'Image remains above limit after compression: size={len(out)} bytes (limit={max_bytes}).')
    else:
        logger.info(
            f'Compressed image from {len(image_bytes)} to {len(out)} bytes; '
            f'quality={quality}, size={img.width}x{img.height}.'
        )

    return out, 'jpeg'
@@ -0,0 +1,231 @@
1
+ import types
2
+ import typing
3
+ from copy import deepcopy
4
+ from dataclasses import is_dataclass
5
+ from datetime import date, datetime, time
6
+ from enum import EnumMeta
7
+ from pydantic import BaseModel, Field, field_validator, model_validator
8
+ from typing import (
9
+ Any,
10
+ Dict,
11
+ List,
12
+ Literal,
13
+ Optional,
14
+ Set,
15
+ Tuple,
16
+ Type,
17
+ Union,
18
+ cast,
19
+ get_args,
20
+ get_origin,
21
+ get_type_hints,
22
+ is_typeddict,
23
+ )
24
+
25
+ JSONType = Literal['string', 'integer', 'number', 'boolean', 'array', 'object', 'null']
26
+ """Valid types within JSON schema."""
27
+
28
+
29
class JSONSchema(BaseModel):
    """JSON Schema for type."""

    type: Optional[JSONType] = Field(default=None)
    """JSON type of tool parameter."""

    format: Optional[str] = Field(default=None)
    """Format of the parameter (e.g. date-time)."""

    description: Optional[str] = Field(default=None)
    """Parameter description."""

    default: Any = Field(default=None)
    """Default value for parameter."""

    enum: Optional[List[Any]] = Field(default=None)
    """Valid values for enum parameters."""

    items: Optional['JSONSchema'] = Field(default=None)
    """Valid type for array parameters."""

    properties: Optional[Dict[str, 'JSONSchema']] = Field(default=None)
    """Valid fields for object parameters."""

    additionalProperties: Optional[Union['JSONSchema', bool]] = Field(default=None)
    """Are additional properties allowed?"""

    anyOf: Optional[List['JSONSchema']] = Field(default=None)
    """Valid types for union parameters."""

    required: Optional[List[str]] = Field(default=None)
    """Required fields for object parameters."""

    @model_validator(mode='before')
    @classmethod
    def convert_type_before_validation(cls, values):
        """Normalize Python-style type names (e.g. 'str', 'int') to JSON type names before validation.

        Works on a deep copy of the input, so the caller's data is never
        modified. Every dict in the input tree that has a string 'type' entry
        gets that entry passed through ``python_type_to_json_type``; names it
        rejects are left untouched and fall through to normal field validation.
        """
        values = deepcopy(values)

        def recursive_convert_type(obj):
            if isinstance(obj, dict):
                # Convert 'type' field if it's a string
                if 'type' in obj and isinstance(obj['type'], str):
                    try:
                        obj['type'] = python_type_to_json_type(obj['type'])
                    except ValueError:
                        # If conversion fails, leave it as is
                        pass
                # Recursively process nested structures; only existing keys
                # are reassigned, so the dict can be updated while iterating.
                for k, v in obj.items():
                    obj[k] = recursive_convert_type(v)
            elif isinstance(obj, list):
                return [recursive_convert_type(item) for item in obj]
            return obj

        return recursive_convert_type(values)
83
+
84
+
85
def json_schema(t: Type[Any]) -> JSONSchema:
    """Provide a JSON Schema for the specified type.

    Schemas can be automatically inferred for a wide variety of
    Python class types including Pydantic BaseModel, dataclasses,
    and typed dicts.

    Sequences (list, set, frozenset, tuple) map to 'array'. A tuple whose
    element types differ gets ``items`` expressed as ``anyOf`` over the
    distinct element types, because one ``items`` schema must accept every
    position. Types that are not recognized produce an empty schema.

    Args:
        t: Python type

    Returns:
        JSON Schema for type.
    """
    origin = get_origin(t)
    args = get_args(t)

    if origin is None:
        # Plain, unsubscripted types. Identity checks keep bool and int apart
        # and never hash or compare `t`.
        if t is int:
            return JSONSchema(type='integer')
        elif t is float:
            return JSONSchema(type='number')
        elif t is str:
            return JSONSchema(type='string')
        elif t is bool:
            return JSONSchema(type='boolean')
        elif t is datetime:
            return JSONSchema(type='string', format='date-time')
        elif t is date:
            return JSONSchema(type='string', format='date')
        elif t is time:
            return JSONSchema(type='string', format='time')
        elif t is list or t is set or t is tuple or t is frozenset:
            return JSONSchema(type='array', items=JSONSchema())
        elif t is dict:
            return JSONSchema(type='object', additionalProperties=JSONSchema())
        elif (is_dataclass(t) or is_typeddict(t) or (isinstance(t, type) and issubclass(t, BaseModel))):
            return cls_json_schema(t)
        elif isinstance(t, EnumMeta):
            return JSONSchema(enum=[item.value for item in t])
        elif t is type(None):
            return JSONSchema(type='null')
        else:
            return JSONSchema()

    # get_origin() returns the runtime class (list, dict, ...) for both the
    # builtin generics and their typing aliases, so comparing against the
    # builtins is sufficient.
    if origin is list or origin is set or origin is frozenset:
        return JSONSchema(type='array', items=json_schema(args[0]) if args else JSONSchema())

    if origin is tuple:
        # Tuple[X, ...] carries an Ellipsis marker that is not an element type.
        element_types: List[Any] = []
        for arg in args:
            if arg is not Ellipsis and arg not in element_types:
                element_types.append(arg)
        if not element_types:
            items = JSONSchema()
        elif len(element_types) == 1:
            items = json_schema(element_types[0])
        else:
            items = JSONSchema(anyOf=[json_schema(arg) for arg in element_types])
        return JSONSchema(type='array', items=items)

    if origin is dict:
        return JSONSchema(
            type='object',
            additionalProperties=json_schema(args[1]) if len(args) > 1 else JSONSchema(),
        )

    # Optional[X] is Union[X, None], so it is handled here as well.
    if origin is Union or origin is types.UnionType:
        return JSONSchema(anyOf=[json_schema(arg) for arg in args])

    if origin is typing.Literal:
        return JSONSchema(enum=list(args))

    return JSONSchema()  # Default case if we can't determine the type
143
+
144
+
145
def cls_json_schema(cls: Type[Any]) -> JSONSchema:
    """Build an 'object' JSON Schema for a dataclass, Pydantic model or TypedDict.

    Args:
        cls: A dataclass, a ``BaseModel`` subclass or a TypedDict class.

    Returns:
        JSON Schema of type 'object' with one property per field,
        ``additionalProperties`` set to False and ``required`` listing the
        fields without a default (None when there are none).
    """
    from dataclasses import MISSING

    properties: Dict[str, JSONSchema] = {}
    required: List[str] = []

    if is_dataclass(cls):
        # Under postponed annotations `field.type` is a string, so resolve
        # the real types first; if that fails, use the raw annotations.
        try:
            resolved_types = get_type_hints(cls)
        except Exception:
            resolved_types = {}

        fields = cls.__dataclass_fields__  # type: ignore
        for name, field in fields.items():
            properties[name] = json_schema(resolved_types.get(name, field.type))  # type: ignore
            # A field is required when it has neither a default value nor a
            # default factory. Identity checks avoid calling a custom __eq__.
            if field.default is MISSING and field.default_factory is MISSING:
                required.append(name)
    elif isinstance(cls, type) and issubclass(cls, BaseModel):
        schema = cls.model_json_schema()
        schema = resolve_schema_references(schema)
        for name, prop in schema.get('properties', {}).items():
            properties[name] = JSONSchema(**prop)
        required = schema.get('required', [])
    elif is_typeddict(cls):
        annotations = get_type_hints(cls)
        for name, type_hint in annotations.items():
            properties[name] = json_schema(type_hint)
            if name in cls.__required_keys__:
                required.append(name)

    return JSONSchema(
        type='object',
        properties=properties,
        required=required if required else None,
        additionalProperties=False,
    )
174
+
175
+
176
def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
    """Map a Python type name to the corresponding JSON Schema type name.

    Names that are already JSON types are returned unchanged, and None maps
    to 'string'.

    Raises:
        ValueError: If the name is neither a JSON type nor a known Python type name.
    """
    if python_type is None:
        # treat 'unknown' as string as anything can be converted to string
        return 'string'

    if python_type in get_args(JSONType):
        return python_type

    python_to_json = (
        ('str', 'string'),
        ('int', 'integer'),
        ('float', 'number'),
        ('bool', 'boolean'),
        ('list', 'array'),
        ('dict', 'object'),
        ('None', 'null'),
    )
    for python_name, json_name in python_to_json:
        if python_type == python_name:
            return json_name

    raise ValueError(f'Unsupported type: {python_type} for Python to JSON conversion.')
198
+
199
+
200
def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Resolves all $ref references in a JSON schema by inlining the definitions.

    Only references of the form ``#/$defs/<name>`` are inlined. The input is
    not modified; the returned schema has no ``$defs`` entry. A reference to
    an unknown definition, or one that points back to a definition currently
    being expanded (a recursive schema), is left in place as a ``$ref`` so
    that expansion always terminates.

    Args:
        schema: JSON schema, optionally carrying a ``$defs`` section.

    Returns:
        A new schema with the resolvable references inlined.
    """
    schema = deepcopy(schema)
    definitions = schema.pop('$defs', {})

    def _resolve_refs(obj: Any, expanding: Tuple[str, ...] = ()) -> Any:
        if isinstance(obj, dict):
            ref = obj.get('$ref')
            if isinstance(ref, str) and ref.startswith('#/$defs/'):
                ref_key = ref.split('/')[-1]
                # `expanding` holds the definitions on the current expansion
                # path; re-entering one of them would recurse forever.
                if ref_key in definitions and ref_key not in expanding:
                    # Replace with a deep copy of the definition, with any
                    # nested references in it resolved as well
                    resolved = _resolve_refs(deepcopy(definitions[ref_key]), expanding + (ref_key,))

                    # Merge in the current object fields, which should take priority
                    # This means that if you have e.g.
                    # {"$ref": "#/$defs/SubType", "description": "subtype of type SubType"},
                    # and SubType resolves to
                    # {"description": "The SubType Class", "parameters": {"param1": {"type": "string"}}},
                    # the final result will be:
                    # {"description": "subtype of type SubType", "parameters": {"param1": {"type": "string"}}}
                    return resolved | {k: o for k, o in obj.items() if k != '$ref'}

            # Process all entries in the dictionary
            return {k: _resolve_refs(v, expanding) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [_resolve_refs(item, expanding) for item in obj]
        else:
            return obj

    return cast(Dict[str, Any], _resolve_refs(schema))