evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py ADDED
@@ -0,0 +1,294 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """
+Translate the following {source_language} sentence into {target_language}:
+
+{source_language}: {source_text}
+{target_language}:
+""".strip()
+
+LANGUAGE_PAIRS = [
+    'en-ar_eg',
+    'en-ar_sa',
+    'en-bg_bg',
+    'en-bn_in',
+    'en-ca_es',
+    'en-cs_cz',
+    'en-da_dk',
+    'en-de_de',
+    'en-el_gr',
+    'en-es_mx',
+    'en-et_ee',
+    'en-fa_ir',
+    'en-fi_fi',
+    'en-fil_ph',
+    'en-fr_ca',
+    'en-fr_fr',
+    'en-gu_in',
+    'en-he_il',
+    'en-hi_in',
+    'en-hr_hr',
+    'en-hu_hu',
+    'en-id_id',
+    'en-is_is',
+    'en-it_it',
+    'en-ja_jp',
+    'en-kn_in',
+    'en-ko_kr',
+    'en-lt_lt',
+    'en-lv_lv',
+    'en-ml_in',
+    'en-mr_in',
+    'en-nl_nl',
+    'en-no_no',
+    'en-pa_in',
+    'en-pl_pl',
+    'en-pt_br',
+    'en-pt_pt',
+    'en-ro_ro',
+    'en-ru_ru',
+    'en-sk_sk',
+    'en-sl_si',
+    'en-sr_rs',
+    'en-sv_se',
+    'en-sw_ke',
+    'en-sw_tz',
+    'en-ta_in',
+    'en-te_in',
+    'en-th_th',
+    'en-tr_tr',
+    'en-uk_ua',
+    'en-ur_pk',
+    'en-vi_vn',
+    'en-zh_cn',
+    'en-zh_tw',
+    'en-zu_za',
+]
+
+LANGUAGE_BY_CODE = {
+    'ar_eg': 'arabic',
+    'ar_sa': 'arabic',
+    'bg_bg': 'bulgarian',
+    'bn_bd': 'bengali',
+    'bn_in': 'bengali',
+    'ca_es': 'catalan',
+    'cs_cz': 'czech',
+    'da_dk': 'danish',
+    'de_de': 'german',
+    'el_gr': 'greek',
+    'es_mx': 'spanish',
+    'et_ee': 'estonian',
+    'fa_ir': 'farsi',
+    'fi_fi': 'finnish',
+    'fil_ph': 'filipino',
+    'fr_ca': 'french',
+    'fr_fr': 'french',
+    'gu_in': 'gujarati',
+    'he_il': 'hebrew',
+    'hi_in': 'hindi',
+    'hr_hr': 'croatian',
+    'hu_hu': 'hungarian',
+    'id_id': 'indonesian',
+    'is_is': 'icelandic',
+    'it_it': 'italian',
+    'ja_jp': 'japanese',
+    'kn_in': 'kannada',
+    'ko_kr': 'korean',
+    'lt_lt': 'lithuanian',
+    'lv_lv': 'latvian',
+    'ml_in': 'malayalam',
+    'mr_in': 'marathi',
+    'nl_nl': 'dutch',
+    'no_no': 'norwegian',
+    'pa_in': 'punjabi',
+    'pl_pl': 'polish',
+    'pt_br': 'portuguese',
+    'pt_pt': 'portuguese',
+    'ro_ro': 'romanian',
+    'ru_ru': 'russian',
+    'sk_sk': 'slovak',
+    'sl_si': 'slovenian',
+    'sr_rs': 'serbian',
+    'sv_se': 'swedish',
+    'sw_ke': 'swahili',
+    'sw_tz': 'swahili',
+    'ta_in': 'tamil',
+    'te_in': 'telugu',
+    'th_th': 'thai',
+    'tr_tr': 'turkish',
+    'uk_ua': 'ukrainian',
+    'ur_pk': 'urdu',
+    'vi_vn': 'vietnamese',
+    'zh_cn': 'mandarin',
+    'zh_tw': 'mandarin',
+    'zu_za': 'zulu',
+    'en': 'english',
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='wmt24pp',
+        pretty_name='WMT2024++',
+        dataset_id='extraordinarylab/wmt24pp',
+        tags=[Tags.MULTI_LINGUAL, Tags.MT],
+        description=(
+            'WMT2024 news translation benchmark supporting multiple language pairs. '
+            'Each subset represents a specific translation direction'
+        ),
+        subset_list=LANGUAGE_PAIRS,
+        eval_split='test',
+        metric_list={
+            'bleu': {},
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/xlm-roberta-large',
+                'model_type': 'xlm-roberta-large'
+            },
+            'comet': {
+                'model_id_or_path': 'evalscope/wmt22-comet-da',
+            }
+        },
+        few_shot_num=0,
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class WMT24PPAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize adapter and configure dataset subsets."""
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self.use_batch_scoring = True  # Enable batch scoring
+
+        if 'comet' in self.metric_list:
+            check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        source_text = str(record['source'])
+        target_text = str(record['target'])
+        language_pair = str(record['language_pair'])
+        source_language, target_language = language_pair.split('-')
+
+        # Format the generation prompt with the text
+        input_prompt = self.prompt_template.format(
+            source_text=source_text,
+            source_language=LANGUAGE_BY_CODE[source_language],
+            target_language=LANGUAGE_BY_CODE[target_language],
+        )
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=target_text,
+            subset_key=language_pair,
+            metadata={
+                'source_text': source_text,
+                'target_text': target_text,
+                'source_language': source_language,
+                'target_language': target_language,
+            },
+        )
+
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """Compute per-sample translation metrics."""
+        # Create a Score object for the current sample
+        score = Score(
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+            value={},
+        )
+
+        # ---- BLEU ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                bleu_results = bleu_ngram_one_sample(filtered_prediction, reference)
+                score.value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU single-sample calculation failed: {e}')
+        return score
+
+    def batch_match_score(
+        self,
+        original_predictions: List[str],
+        filtered_predictions: List[str],
+        references: List[str],
+        task_states: List[TaskState],
+    ) -> List[Score]:
+        """Compute batched translation metrics (BLEU, BERTScore, COMET)."""
+        scores: List[Score] = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={},
+            )
+            scores.append(score)
+
+        # ---- BLEU (per-sample within batch) ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                for i in range(len(scores)):
+                    bleu_results = bleu_ngram_one_sample(filtered_predictions[i], references[i])
+                    scores[i].value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU batch calculation failed: {e}')
+
+        # ---- BERTScore ----
+        if 'bert_score' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import BertScore
+
+                score_args = self.metric_list.get('bert_score', {})
+                bert_scorer = BertScore(**score_args)
+                bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+                for i in range(len(scores)):
+                    scores[i].value.update({'bert_score': bert_score_f1[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BERTScore batch calculation failed: {e}')
+
+        # ---- COMET ----
+        if 'comet' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import COMETScore
+
+                score_args = self.metric_list.get('comet', {})
+                comet_scorer = COMETScore(**score_args)
+                data = [{
+                    'src': st.metadata.get('source_text'),
+                    'mt': pred,
+                    'ref': ref
+                } for pred, ref, st in zip(filtered_predictions, references, task_states)]
+                comet_scores = comet_scorer.apply(data)
+                for i in range(len(scores)):
+                    scores[i].value.update({'comet': comet_scores[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] COMET batch calculation failed: {e}')
+
+        return scores
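For context, the adapter above registers under the name `wmt24pp`, so it can be driven through the standard task entry points. A minimal sketch, assuming the usual TaskConfig fields (`model`, `datasets`, and `limit` are not shown in this diff and are taken from evalscope's documented usage; the model id is hypothetical):

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Smoke-test the wmt24pp benchmark on two translation directions.
task = TaskConfig(
    model='qwen2.5-7b-instruct',  # hypothetical OpenAI-compatible model id
    datasets=['wmt24pp'],  # name registered by the BenchmarkMeta above
    dataset_args={'wmt24pp': {'subset_list': ['en-de_de', 'en-ja_jp']}},
    limit=10,  # assumed field: cap samples per subset for a quick run
)
run_task(task)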
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# Define the prompt template
+PROMPT_TEMPLATE = """{question}
+\n\n\nLet's think step by step and give the final answer in curly braces,
+like this: {{final answer}}"
+"""
+
+SUBSET_LIST = ['default']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='zerobench',
+        pretty_name='ZeroBench',
+        dataset_id='evalscope/zerobench',
+        tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+        description=
+        'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image types. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='zerobench',
+        train_split='zerobench_subquestions',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class ZeroBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question_text']
+        content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+        image = record['question_images_decoded']
+        if len(image) > 0:
+            for img in image:
+                # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'question_id': record['question_id'],
+            'question_images': record['question_images'],
+            'image_attribution': record['image_attribution']
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+        )
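The image handling above can be exercised on its own; a minimal sketch assuming only the two helper signatures visible in this diff (the input file is hypothetical):

from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit

with open('example.png', 'rb') as f:  # hypothetical local image
    raw_bytes = f.read()

# Compress until the payload fits under the 10MB data-URI limit, then
# base64-encode with a data-URI header (e.g. 'data:image/jpeg;base64,...').
processed_bytes, fmt = compress_image_to_limit(raw_bytes, 10_000_000)
image_uri = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
print(image_uri[:60])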
evalscope/cli/cli.py CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 
+from evalscope import __version__
 from evalscope.cli.start_app import StartAppCMD
 from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
@@ -9,6 +10,7 @@ from evalscope.cli.start_perf import PerfBenchCMD
 
 def run_cmd():
     parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+    parser.add_argument('-v', '--version', action='version', version=f'evalscope {__version__}')
     subparsers = parser.add_subparsers(help='EvalScope command line helper.')
 
     PerfBenchCMD.define_args(subparsers)
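The new flag relies on argparse's built-in 'version' action; a standalone sketch of the behavior:

import argparse

parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
parser.add_argument('-v', '--version', action='version', version='evalscope 1.2.0')
# `evalscope -v` now prints 'evalscope 1.2.0' and exits before any subcommand runs
parser.parse_args(['-v'])  # raises SystemExit(0) after printing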
evalscope/cli/start_app.py CHANGED
@@ -3,7 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.report.app import create_app
 
 
 def subparser_func(args):
@@ -22,8 +21,19 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.app import add_argument
+
         parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-        create_app()
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
+
+        create_app(self.args)
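This is the optional-dependency pattern applied across the CLI commands in this release: heavy imports move from module level into execute(), so the CLI starts without the extra installed and only fails, with an actionable hint, when the feature is actually used. A generic sketch (names are illustrative, not evalscope APIs):

def run_optional_feature(args):
    try:
        from some_package.optional_feature import main  # heavy, optional import
    except ImportError as e:
        raise ImportError(
            f'Optional feature unavailable, due to {e}. '
            "Install it with `pip install 'some_package[extra]'`."
        ) from e
    main(args)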
evalscope/cli/start_eval.py CHANGED
@@ -1,10 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from argparse import ArgumentParser
 
-from evalscope.arguments import add_argument
 from evalscope.cli.base import CLICommand
-from evalscope.run import run_task
 
 
 def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.arguments import add_argument
+
         parser = parsers.add_parser(EvalCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.run import run_task
+
         run_task(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -3,8 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.perf.arguments import add_argument
-from evalscope.perf.main import run_perf_benchmark
 
 
 def subparser_func(args):
@@ -23,9 +21,19 @@ class PerfBenchCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.perf.arguments import add_argument
+
         parser = parsers.add_parser(PerfBenchCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
+
         run_perf_benchmark(self.args)
evalscope/cli/start_server.py CHANGED
@@ -25,14 +25,16 @@ def add_perf_args(parser):
         '--logdir',
         required=True,
         type=str,
-        help='The monitor log save dir, tensorboard start at this path for display!')
+        help='The monitor log save dir, tensorboard start at this path for display!'
+    )
     parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
     parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
 
 
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8'
+    )
     return sub_process
 
 
@@ -61,7 +63,8 @@ def start_server(args):
         bufsize=1,
         shell=True,
         universal_newlines=True,
-        encoding='utf8')
+        encoding='utf8'
+    )
 
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
evalscope/collections/__init__.py CHANGED
@@ -1,3 +1,27 @@
-from evalscope.collections.evaluator import EvaluatorCollection
-from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
-from evalscope.collections.schema import CollectionSchema, DatasetInfo
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .sampler import DatasetEntry, StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'sampler': ['StratifiedSampler', 'UniformSampler', 'WeightedSampler', 'DatasetEntry'],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
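A generic sketch of the lazy-module technique behind `_LazyModule` (in the spirit of the transformers-style loader; evalscope's actual implementation may differ): submodules are imported only when one of their exported names is first accessed.

import importlib
import types

class LazyModule(types.ModuleType):
    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported name to the submodule that defines it.
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }
        self.__all__ = list(self._name_to_module)

    def __getattr__(self, attr):
        if attr not in self._name_to_module:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        submodule = importlib.import_module(f'.{self._name_to_module[attr]}', self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so __getattr__ is not hit again
        return value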
evalscope/collections/sampler.py CHANGED
@@ -1,18 +1,17 @@
 import random
 from abc import ABC, abstractmethod
-from dataclasses import asdict, dataclass, field
+from pydantic import BaseModel, Field
 from tqdm import tqdm
 from typing import List, Optional
 
 from evalscope.collections.schema import CollectionSchema, DatasetInfo
 
 
-@dataclass
-class DatasetEntry:
+class DatasetEntry(BaseModel):
     index: int = 0
-    prompt: dict = field(default_factory=dict)
-    tags: List[str] = field(default_factory=list)
-    categories: List[str] = field(default_factory=list)
+    prompt: dict = Field(default_factory=dict)
+    tags: List[str] = Field(default_factory=list)
+    categories: List[str] = Field(default_factory=list)
     task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
@@ -33,25 +32,27 @@ class Sampler(ABC):
         all_data = []
         data_dict = dataset.get_data()
         for subset_name, subset_data in data_dict.items():
-            for prompt in subset_data:
+            for sample in subset_data:
                 all_data.append(
                     DatasetEntry(
-                        prompt=prompt,
+                        prompt=sample.model_dump(exclude_none=True),
                         tags=dataset.tags,
                         categories=dataset.hierarchy,
                         task_type=dataset.task_type,
                         weight=dataset.weight,
                         dataset_name=dataset.name,
                         subset_name=subset_name,
-                    ))
-        sampled_data = random.choices(all_data, k=count)
+                    )
+                )
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
         result = []
         for i, entry in enumerate(all_data):
             entry.index = i
-            result.append(asdict(entry))
+            result.append(entry.model_dump())
         return result
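The switch from random.choices to random.sample changes the sampling semantics: choices draws with replacement (duplicates possible), while sample draws without replacement and raises ValueError when k exceeds the population size, which is why count is clamped above. A quick illustration:

import random

population = list(range(5))
print(random.choices(population, k=8))  # with replacement: duplicates allowed, k may exceed the population
k = min(8, len(population))  # same guard as in Sampler above
print(random.sample(population, k=k))  # without replacement: 5 distinct entries
# random.sample(population, k=8) would raise ValueError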
evalscope/collections/schema.py CHANGED
@@ -3,6 +3,10 @@ import json
 from dataclasses import asdict, dataclass, field
 from typing import List, Union
 
+from evalscope.api.dataset import DatasetDict
+from evalscope.api.registry import get_benchmark
+from evalscope.config import TaskConfig
+
 
 @dataclass
 class DatasetInfo:
@@ -13,16 +17,11 @@ class DatasetInfo:
     args: dict = field(default_factory=dict)
     hierarchy: List[str] = field(default_factory=list)
 
-    def get_data(self) -> dict:
-        from evalscope.benchmarks import Benchmark
-
-        benchmark_meta = Benchmark.get(self.name)
-
-        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
-        prompts = data_adapter.gen_prompts(data_dict)
-        return prompts
+    def get_data(self) -> DatasetDict:
+        dataset_args = {self.name: self.args}
+        benchmark_meta = get_benchmark(self.name, config=TaskConfig(dataset_args=dataset_args))
+        data_dict = benchmark_meta.load_dataset()
+        return data_dict
 
 
 def flatten_weight(collection: 'CollectionSchema', base_weight=1):
@@ -112,8 +111,10 @@ if __name__ == '__main__':
             ]),
             CollectionSchema(
                 name='chinese',
-                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
-        ])
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
+            )
+        ]
+    )
     print(schema)
     print(schema.flatten())
     schema.dump_json('outputs/schema.json')
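With get_data now routed through the benchmark registry, a DatasetInfo can also be loaded standalone; a minimal sketch using only names visible in this diff (the iteration mirrors the Sampler code above; len() support on a subset is assumed):

from evalscope.collections.schema import DatasetInfo

info = DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})
data_dict = info.get_data()  # DatasetDict keyed by subset name
for subset_name, subset_data in data_dict.items():
    print(subset_name, len(subset_data))  # assumed: subsets support len()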