evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,370 @@
1
+ import abc
2
+ import random
3
+ from collections import defaultdict
4
+ from dataclasses import dataclass, field
5
+ from pydantic import BaseModel, Field
6
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
7
+
8
+ from evalscope.api.messages import ChatMessage, messages_to_markdown
9
+ from evalscope.api.tool import ToolInfo
10
+
11
+
12
class Sample(BaseModel):
    r"""Sample for an evaluation task."""

    input: Union[str, List[ChatMessage]]
    """Model input: either a plain string or a list of chat messages."""

    choices: Optional[List[str]] = None
    """Available answer choices (only used by multiple-choice evals)."""

    target: Union[str, List[str]] = ''
    """Ideal output: a literal value, or narrative text for a model grader."""

    id: Optional[int] = None
    """Unique identifier of the sample."""

    group_id: Optional[int] = None
    """Identifier of the group the sample belongs to (groups k repeated samples)."""

    tools: Optional[List[ToolInfo]] = None
    """Tools the model may use during inference (optional)."""

    subset_key: Optional[str] = None
    """Key of the subset the sample belongs to, used when generating subsets (optional)."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Arbitrary metadata attached to the sample."""

    sandbox: Optional[str] = None
    """Sandbox environment type and optional config file."""

    files: Optional[Dict[str, str]] = None
    """Files accompanying the sample (copied to SandboxEnvironment)."""

    setup: Optional[str] = None
    """Setup script for the sample (run within default SandboxEnvironment)."""

    def pretty_print(self) -> str:
        """Build a short, human-readable summary of the sample.

        String inputs are shown as-is; message lists are rendered through
        ``messages_to_markdown`` with ``max_length=50``.

        Returns:
            A three-line string with the sample id, input and target.
        """
        rendered = (
            self.input if isinstance(self.input, str) else messages_to_markdown(self.input, max_length=50)
        )
        return f'Sample ID: {self.id}\nInput: {rendered}\nTarget: {self.target}'
55
+
56
+
57
@dataclass
class FieldSpec:
    r"""Specification for mapping data source fields to sample fields."""

    # Name of the record field holding the sample input.
    input: str = 'input'

    # Name of the record field holding the sample target.
    target: str = 'target'

    # Name of the record field holding the list of answer choices.
    choices: str = 'choices'

    # Identifier for the sample; defaults to 0.
    # NOTE(review): unlike its siblings this member is an int rather than a
    # field name -- confirm how the consumer of FieldSpec interprets it.
    id: int = 0

    # Names of additional record fields to be read as metadata.
    metadata: Optional[List[str]] = None

    # Name of the record field holding the sandbox type / optional config file.
    sandbox: str = 'sandbox'

    # Name of the record field holding the files that accompany the sample.
    files: str = 'files'

    # Name of the record field holding the sample's setup script.
    setup: str = 'setup'
84
+
85
+
86
class Dataset(Sequence[Sample], abc.ABC):
    r"""Abstract, ordered collection of Sample objects.

    Concrete datasets give sequential access to their samples through
    conventional integer indexes or slices.
    """

    @property
    @abc.abstractmethod
    def name(self) -> Optional[str]:
        """Name of the dataset, if it has one."""

    @property
    @abc.abstractmethod
    def location(self) -> Optional[str]:
        """Location of the dataset, if it has one."""

    @property
    @abc.abstractmethod
    def shuffled(self) -> bool:
        """Whether the dataset has been shuffled."""

    @abc.abstractmethod
    def __iter__(self) -> Iterator[Sample]:
        """Return an iterator over the samples."""

    @abc.abstractmethod
    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, 'Dataset']:
        """Return one sample for an integer index, or a dataset for a slice."""

    @abc.abstractmethod
    def __len__(self) -> int:
        """Return the number of samples."""

    @abc.abstractmethod
    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'Dataset':
        """Keep only the samples for which ``predicate`` is true.

        Args:
            predicate: Filtering function applied to each sample.
            name: Name for the filtered dataset (optional).

        Returns:
            The filtered dataset.
        """

    @abc.abstractmethod
    def shuffle(self, seed: Optional[int] = None) -> None:
        """Shuffle the sample order in place.

        Args:
            seed: Random seed for shuffling (optional).
        """

    @abc.abstractmethod
    def shuffle_choices(self, seed: Optional[int] = None) -> None:
        """Shuffle the order of the choices within each sample.

        Args:
            seed: Random seed for shuffling (optional).
        """

    @abc.abstractmethod
    def reindex(self, group_size=1):
        """Re-number the samples so ids follow the current ordering.

        Args:
            group_size: Number of samples per group, used to set group_id.
        """
160
+
161
+
162
class MemoryDataset(Dataset):
    r"""A Dataset stored in memory."""

    def __init__(
        self,
        samples: List[Sample],
        name: Optional[str] = None,
        location: Optional[str] = None,
        shuffled: bool = False,
    ) -> None:
        r"""Wrap an in-memory list of samples as a dataset.

        The MemoryDataset keeps a reference to the list it is given and
        offers sequential access to it (integer indexes or slices).

        Args:
            samples (List[Sample]): The list of sample objects.
            name (str | None): Optional name for dataset.
            location (str | None): Optional location for dataset.
            shuffled (bool): Was the dataset shuffled after reading.
        """
        self._shuffled = shuffled
        self._location = location
        self._name = name
        self.samples = samples

    @property
    def name(self) -> Optional[str]:
        """Dataset name."""
        return self._name

    @property
    def location(self) -> Optional[str]:
        """Dataset location."""
        return self._location

    @property
    def shuffled(self) -> bool:
        """Was the dataset shuffled."""
        return self._shuffled

    def __iter__(self) -> Iterator[Sample]:
        """Iterate over the underlying sample list."""
        return iter(self.samples)

    def __getitem__(self, index: Union[int, slice]) -> Union[Sample, Dataset]:
        """Return a sample for an int index, or a new MemoryDataset for a slice."""
        if not isinstance(index, int):
            # Slices keep the metadata of the parent dataset.
            return MemoryDataset(
                samples=self.samples[index],
                name=self.name,
                location=self.location,
                shuffled=self.shuffled,
            )
        return self.samples[index]

    def __len__(self) -> int:
        """Number of samples held."""
        return len(self.samples)

    def shuffle(self, seed: Optional[int] = None) -> None:
        """Shuffle the samples in place and mark the dataset as shuffled.

        Args:
            seed: Seed for a dedicated generator; without it the module-level
                ``random`` generator is used.
        """
        shuffler = random.shuffle if seed is None else random.Random(seed).shuffle
        shuffler(self.samples)
        self._shuffled = True

    def shuffle_choices(self, seed: Optional[int] = None) -> None:
        """Shuffle each sample's choices and remap its target accordingly.

        Samples without choices are left untouched.

        Args:
            seed: Random seed for shuffling (optional).
        """
        from evalscope.utils.multi_choices import answer_character

        rand = random.Random(seed)
        for sample in self.samples:
            if sample.choices:
                # Shuffle the original positions rather than the choices themselves,
                # so the old -> new correspondence stays available.
                positions = list(range(len(sample.choices)))
                rand.shuffle(positions)

                # original position -> answer letter at its new position
                position_map = {}
                for new_pos, old_pos in enumerate(positions):
                    position_map[old_pos] = answer_character(new_pos)

                sample.choices = [sample.choices[old_pos] for old_pos in positions]
                sample.target = self._remap_target(sample.target, position_map=position_map)

    def _remap_target(self, target: Union[str, List[str]], position_map: Dict[int, str]) -> Union[str, List[str]]:
        """Translate target letter(s) through ``position_map``.

        Args:
            target: A single target or a list of targets; each is converted to
                an index with ``answer_index``.
            position_map: Mapping from original choice position to new letter.

        Returns:
            The remapped target, with the same str/list shape as the input.
        """
        from evalscope.utils.multi_choices import answer_index

        if not isinstance(target, list):
            return position_map[answer_index(target)]
        return [position_map[answer_index(item)] for item in target]

    def filter(self, predicate: Callable[[Sample], bool], name: Optional[str] = None) -> 'MemoryDataset':
        """Return a new MemoryDataset containing the samples accepted by ``predicate``.

        Args:
            predicate: Filtering function.
            name: Name for the filtered dataset; falls back to this dataset's name.
        """
        kept = list(filter(predicate, self.samples))
        return MemoryDataset(
            samples=kept,
            name=name or self.name,
            location=self.location,
            shuffled=self.shuffled,
        )

    def reindex(self, group_size=1):
        """Assign consecutive ids and derive group ids from them.

        Args:
            group_size: Number of consecutive samples sharing one group_id.
        """
        position = 0
        for sample in self.samples:
            sample.id = position
            sample.group_id = position // group_size
            position += 1
270
+
271
+
272
class DatasetDict:
    """
    A dictionary-like container for datasets.
    """

    def __init__(self, datasets: Dict[str, Dataset]):
        """Wrap a mapping of subset key -> dataset (the mapping is stored, not copied)."""
        self.datasets = datasets

    def __getitem__(self, key: str) -> Dataset:
        return self.datasets[key]

    def __setitem__(self, key: str, value: Dataset) -> None:
        self.datasets[key] = value

    def __delitem__(self, key: str) -> None:
        del self.datasets[key]

    def get(self, key: str, default: Optional[Dataset] = None) -> Optional[Dataset]:
        """Return the dataset stored under ``key``, or ``default`` if absent."""
        return self.datasets.get(key, default)

    def items(self):
        """Return the (key, dataset) pairs of the underlying mapping."""
        return self.datasets.items()

    def keys(self):
        """Return the subset keys of the underlying mapping."""
        return self.datasets.keys()

    def values(self):
        """Return the datasets of the underlying mapping."""
        return self.datasets.values()

    def __len__(self) -> int:
        return len(self.datasets)

    @classmethod
    def from_dataset(
        cls,
        dataset: Dataset,
        subset_list: List[str],
        limit: Optional[Union[int, float]] = None,
        repeats: int = 1
    ) -> 'DatasetDict':
        """
        Create a DatasetDict from a single Dataset using subset key in the sample.

        Samples without a ``subset_key`` are assigned to the ``'default'`` subset.
        Subsets that are not listed in ``subset_list`` are dropped.

        Args:
            dataset (Dataset): The dataset to wrap in a DatasetDict.
            subset_list (List[str]): List of subset keys to include.
            limit (int | float | None): Optional limit on number of samples per subset.
                If int, limits to that many samples. If float, limits to that fraction
                of each subset's own samples.
            repeats (int): Number of consecutive repeats of each sample; the limit is
                multiplied by it and it is used as the group size when reindexing.

        Returns:
            DatasetDict: A new DatasetDict containing one dataset per requested subset.
        """
        data_dict = defaultdict(list)
        dataset_dict = defaultdict(list)
        # init subset keys to prevent order issues
        for key in subset_list:
            data_dict[key] = []
            dataset_dict[key] = []

        # Group every sample under its subset key
        for sample in dataset.samples:
            data_dict[sample.subset_key or 'default'].append(sample)

        # Create a MemoryDataset for each requested subset key
        for key, samples in data_dict.items():
            if key not in subset_list:
                continue
            if limit is not None:
                # Resolve the limit per subset. `limit` itself must not be rebound:
                # a fractional limit has to be applied to every subset's own size,
                # not frozen to the absolute count computed for the first subset.
                if isinstance(limit, float):
                    subset_limit = int(len(samples) * limit)
                else:
                    subset_limit = limit
                samples = samples[:subset_limit * repeats]
            cur_dataset = MemoryDataset(samples, name=dataset.name)
            # Reindex the dataset to ensure consistent IDs and group IDs
            cur_dataset.reindex(group_size=repeats)
            dataset_dict[key] = cur_dataset
        return cls(dataset_dict)

    @classmethod
    def from_dataset_dicts(cls, dataset_dicts: List['DatasetDict']) -> 'DatasetDict':
        """
        Create a DatasetDict by merging multiple DatasetDicts.

        Samples stored under the same key are concatenated in the order the
        DatasetDicts are given; each merged dataset is named after its key.

        Args:
            dataset_dicts (List[DatasetDict]): List of DatasetDicts to merge.

        Returns:
            DatasetDict: A new DatasetDict containing the merged datasets.
        """
        merged_dict = defaultdict(list)
        for dataset_dict in dataset_dicts:
            for key, dataset in dataset_dict.items():
                merged_dict[key].extend(dataset.samples)
        # Create a MemoryDataset for each subset key
        return cls({key: MemoryDataset(samples, name=key) for key, samples in merged_dict.items()})
@@ -0,0 +1,266 @@
1
+ import copy
2
+ import os
3
+ import random
4
+ from abc import ABC, abstractmethod
5
+ from pathlib import Path
6
+ from typing import Callable, Dict, List, Optional, Union
7
+
8
+ from evalscope.api.dataset.utils import record_to_sample_fn
9
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
10
+ from evalscope.utils import get_logger
11
+ from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename, tsv_to_list
12
+ from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
13
+ from .utils import data_to_samples, shuffle_choices_if_requested
14
+
15
+ logger = get_logger()
16
+
17
+
18
class DataLoader(ABC):
    """
    Abstract base class for data loaders.
    """

    def __init__(
        self,
        data_id_or_path: str,
        split: str,
        sample_fields: Union[FieldSpec, Callable] = None,
        filter_func: Callable = None,
        subset: str = 'default',
        version: str = None,
        limit: Union[int, float] = None,
        data_source: Optional[str] = None,
        shuffle: bool = False,
        shuffle_choices: Optional[Union[bool, int]] = None,
        seed: Optional[int] = None,
        auto_id: bool = True,
        repeats: int = 1,
        trust_remote: bool = True,
        **kwargs
    ):
        """Record the loading configuration; nothing is loaded here.

        Args:
            data_id_or_path: Dataset identifier or local path.
            split: Dataset split to load.
            sample_fields: Field specification or callable used to turn records into samples.
            filter_func: Optional predicate for filtering the loaded samples.
            subset: Subset name.
            version: Dataset version.
            limit: Maximum number of records (int) or fraction of records (float).
            data_source: Where the dataset comes from.
            shuffle: Whether to shuffle the records.
            shuffle_choices: Choice-shuffling request (flag or seed).
            seed: Random seed for shuffling.
            auto_id: Whether ids and group ids are assigned automatically.
            repeats: Number of times each record is repeated.
            trust_remote: Whether remote code is trusted when loading.
            **kwargs: Extra loader-specific options, stored as ``self.kwargs``.
        """
        # what to load
        self.data_id_or_path, self.split = data_id_or_path, split
        self.subset, self.version = subset, version
        self.data_source = data_source
        self.trust_remote = trust_remote
        # how to turn records into samples
        self.sample_fields = sample_fields
        self.filter_func = filter_func
        self.auto_id = auto_id
        # sampling options
        self.limit = limit
        self.repeats = repeats
        self.shuffle = shuffle
        self.shuffle_choices = shuffle_choices
        self.seed = seed
        # everything else
        self.kwargs = kwargs

    @abstractmethod
    def load(self) -> Dataset:
        """
        Load data from the source.
        """
63
+
64
+
65
class RemoteDataLoader(DataLoader):
    """
    Data loader for remote datasets: ModelScope or Huggingface.
    """

    def load(self) -> Dataset:
        """Load the configured dataset and convert it to a MemoryDataset.

        A previously saved copy under the evalscope datasets cache directory is
        reused when present; otherwise the dataset is fetched from the
        configured data source and (unless the source is local) saved there.
        Shuffling, limiting, repeating, filtering, reindexing and choice
        shuffling are then applied in that order.

        Returns:
            The loaded dataset.

        Raises:
            ValueError: If the data source is not supported, or if the limit is
                a negative integer.
        """
        import datasets
        from modelscope import MsDataset

        path = self.data_id_or_path
        # resolve data_to_sample function
        data_to_sample = record_to_sample_fn(self.sample_fields)
        # generate a unique cache dir for this dataset
        dataset_hash = gen_hash(f'{path}{self.split}{self.subset}{self.version}{self.kwargs}')
        datasets_cache_dir = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'datasets')
        dataset_cache_dir = os.path.join(datasets_cache_dir, f'{safe_filename(path)}-{dataset_hash}')
        if os.path.exists(dataset_cache_dir):
            dataset = datasets.load_from_disk(dataset_cache_dir)
        else:
            logger.info(
                f'Loading dataset {path} from {self.data_source} > subset: {self.subset} > split: {self.split} ...'
            )
            if self.data_source == HubType.MODELSCOPE:
                dataset = MsDataset.load(
                    dataset_name=path,
                    split=self.split,
                    subset_name=self.subset,
                    version=self.version,
                    trust_remote_code=self.trust_remote,
                    **self.kwargs,
                )
                # convert to Huggingface dataset if necessary
                if not isinstance(dataset, datasets.Dataset):
                    dataset = dataset.to_hf_dataset()
            elif self.data_source in [HubType.HUGGINGFACE, HubType.LOCAL]:
                # remove dataset_infos.json file if exists, since datasets will occur an error if it exists.
                dataset_infos_path = os.path.join(path, 'dataset_infos.json')
                if os.path.exists(dataset_infos_path):
                    logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid datasets errors.')
                    os.remove(dataset_infos_path)
                # load dataset from Huggingface or local path
                dataset = datasets.load_dataset(
                    path=path,
                    name=self.subset if self.subset != 'default' else None,
                    split=self.split,
                    revision=self.version,
                    trust_remote_code=self.trust_remote,
                    **self.kwargs,
                )
            else:
                # Fail with a clear message instead of an UnboundLocalError on `dataset` below.
                raise ValueError(f'Unsupported data source: {self.data_source!r}')

            # Only save to disk if not loading from local path
            if self.data_source != HubType.LOCAL:
                dataset.save_to_disk(dataset_cache_dir)

        # shuffle if requested
        if self.shuffle:
            dataset = dataset.shuffle(seed=self.seed)

        # limit if requested; resolved into a local so that self.limit keeps the
        # configured value and repeated load() calls behave the same
        limit = self.limit
        if limit:
            if isinstance(limit, float):
                limit = int(len(dataset) * limit)
            elif isinstance(limit, int) and limit < 0:
                raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
            if len(dataset) > limit:
                dataset = dataset.select(range(limit))

        # convert to list
        dataset = dataset.to_list()

        # repeat k times (each record's copies are consecutive)
        if self.repeats > 1:
            dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]

        # return the dataset
        memory_dataset = MemoryDataset(
            samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
            name=Path(path).stem if Path(path).exists() else path,
            location=path,
        )

        # Apply filtering if a filter function is provided
        if self.filter_func is not None:
            memory_dataset = memory_dataset.filter(self.filter_func)

        # assign ids and group_ids if requested
        if self.auto_id:
            memory_dataset.reindex(group_size=self.repeats)

        shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)

        return memory_dataset
157
+
158
+
159
class LocalDataLoader(DataLoader):
    """
    Data loader for local datasets. Reads from JSONL, CSV or TSV files.
    """

    def load(self) -> Dataset:
        """Load records from a local file and convert them to a MemoryDataset.

        ``data_id_or_path`` may be a file with a supported extension, or a
        directory containing ``{subset}_{split}{ext}`` or ``{subset}{ext}``.
        Extensions are tried in the order ``.jsonl``, ``.csv``, ``.tsv`` and the
        first file found is used. If nothing is found the dataset is empty.
        Shuffling, limiting, repeating, filtering, reindexing and choice
        shuffling are then applied in that order.

        Returns:
            The loaded dataset.

        Raises:
            ValueError: If the limit is a negative integer.
        """
        path = self.data_id_or_path
        data_to_sample = record_to_sample_fn(self.sample_fields)
        dataset = []

        # Check for JSONL, CSV or TSV files in the specified path
        for ext, loader in [
            ('.jsonl', jsonl_to_list),
            ('.csv', csv_to_list),
            ('.tsv', tsv_to_list),
        ]:
            # Either the path is itself a file with this extension,
            # or look for the conventional file names inside the directory
            if os.path.isfile(path) and path.endswith(ext):
                file_paths = [path]
            else:
                file_paths = [
                    os.path.join(path, f'{self.subset}_{self.split}{ext}'),
                    os.path.join(path, f'{self.subset}{ext}')
                ]
            found = next((file_path for file_path in file_paths if os.path.exists(file_path)), None)
            if found is not None:
                dataset = loader(found)
                # Stop checking other extensions once a file is found
                break

        # shuffle if requested; random.shuffle() takes no seed argument, so a
        # dedicated generator is seeded instead
        if self.shuffle:
            random.Random(self.seed).shuffle(dataset)

        # limit if requested; resolved into a local so that self.limit keeps the
        # configured value and repeated load() calls behave the same
        limit = self.limit
        if limit:
            if isinstance(limit, float):
                limit = int(len(dataset) * limit)
            elif isinstance(limit, int) and limit < 0:
                raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
            dataset = dataset[:limit]

        # repeat k times (each record's copies are consecutive)
        if self.repeats > 1:
            dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]

        # return the dataset
        memory_dataset = MemoryDataset(
            samples=data_to_samples(data=dataset, data_to_sample=data_to_sample),
            name=Path(path).stem if Path(path).exists() else path,
            location=path,
        )

        # Apply filtering if a filter function is provided
        if self.filter_func is not None:
            memory_dataset = memory_dataset.filter(self.filter_func)

        # assign ids and group_ids if requested
        if self.auto_id:
            memory_dataset.reindex(group_size=self.repeats)

        shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)

        return memory_dataset
224
+
225
+
226
class DictDataLoader(DataLoader):
    """Load dataset from a list of dictionaries."""

    def __init__(self, dict_list: list, **kwargs):
        """Store the records to load.

        Args:
            dict_list: The records to convert into samples.
            **kwargs: Forwarded to ``DataLoader.__init__``; ``data_id_or_path``
                and ``split`` are fixed to empty strings.
        """
        super().__init__(data_id_or_path='', split='', **kwargs)
        self.dict_list = dict_list

    def load(self) -> Dataset:
        """Convert the stored records to a MemoryDataset.

        Shuffling, limiting, repeating, filtering, reindexing and choice
        shuffling are applied in that order. The list passed to the
        constructor is not reordered.

        Returns:
            The loaded dataset.

        Raises:
            ValueError: If the limit is a negative integer.
        """
        data_to_sample = record_to_sample_fn(self.sample_fields)
        # Work on a shallow copy so shuffling does not reorder the caller's list
        dataset = list(self.dict_list)

        # shuffle if requested; random.shuffle() takes no seed argument, so a
        # dedicated generator is seeded instead
        if self.shuffle:
            random.Random(self.seed).shuffle(dataset)

        # limit if requested; resolved into a local so that self.limit keeps the
        # configured value and repeated load() calls behave the same
        limit = self.limit
        if limit:
            if isinstance(limit, float):
                limit = int(len(dataset) * limit)
            elif isinstance(limit, int) and limit < 0:
                raise ValueError('Limit must be a non-negative integer or a float between 0 and 1.')
            dataset = dataset[:limit]

        # repeat k times (each record's copies are consecutive)
        if self.repeats > 1:
            dataset = [copy.deepcopy(item) for item in dataset for _ in range(self.repeats)]

        # return the dataset
        memory_dataset = MemoryDataset(samples=data_to_samples(data=dataset, data_to_sample=data_to_sample), )

        # Apply filtering if a filter function is provided
        if self.filter_func is not None:
            memory_dataset = memory_dataset.filter(self.filter_func)

        # assign ids and group_ids if requested
        if self.auto_id:
            memory_dataset.reindex(group_size=self.repeats)

        shuffle_choices_if_requested(memory_dataset, self.shuffle_choices)

        return memory_dataset