evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/healthbench/healthbench_adapter.py
@@ -0,0 +1,282 @@
+ import copy
+ import os
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.dataset.dataset import DatasetDict
+ from evalscope.api.dataset.loader import LocalDataLoader
+ from evalscope.api.messages.chat_message import ChatMessageUser, dict_to_chat_message
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.function_utils import retry_context
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ GRADER_TEMPLATE = """
+ Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item.
+
+ # Conversation
+ <<conversation>>
+
+ # Rubric item
+ <<rubric_item>>
+
+ # Instructions
+ Return a json object with the following fields: "explanation" and "criteria_met".
+ - The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item.
+ - The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true if all of the criteria are met.
+ - One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. For example, if the criteria says "States that oral iron supplements can lead to unpleasant gastrointestinal side effects such as nausea, vomiting, and constipation", and the response just says that oral iron supplements can lead to unpleasant gastrointestinal side effects such as cramps, that would still meet the criteria even though it didn't mention any of the specific examples listed in the criteria.
+
+ # Example 1
+ For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this:
+
+ ```json
+ {
+   "explanation": "The assistant did not tell the user to go to the hospital if unconscious.",
+   "criteria_met": false
+ }
+ ```
+
+ # Example 2
+ Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not.
+ For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this:
+
+ ```json
+ {
+   "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.",
+   "criteria_met": false
+ }
+ ```
+
+ As another example, if the criteria says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this:
+
+ ```json
+ {
+   "explanation": "The response is concise, so the criteria was not met.",
+   "criteria_met": false
+ }
+ ```
+
+ In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true.
+
+ # Final instruction
+ Return just the json object in markdown format. Do not include any other text in the response.
+ """.strip()  # noqa: E501
+
+ # Available subsets in the HealthBench dataset
+ # Each subset focuses on different aspects of health-related conversations
+ SUBSET_LIST = [
+     'emergency_referrals',  # Situations requiring immediate medical attention
+     'communication',  # Communication skills and patient interaction
+     'complex_responses',  # Complex medical scenarios requiring detailed responses
+     'hedging',  # Appropriate uncertainty and hedging in medical advice
+     'health_data_tasks',  # Tasks involving health data analysis
+     'global_health',  # Global health perspectives and cultural considerations
+     'context_seeking',  # Ability to seek additional context when needed
+ ]
+
+ # Available versions of the dataset
+ VERSION = [
+     'Consensus',
+     'Hard',
+     'All',
+ ]
+
+ # Mapping of version names to their corresponding data files
+ VERSION_FILE = {
+     'All': '2025-05-07-06-14-12_oss_eval.jsonl',  # Complete dataset
+     'Consensus': 'consensus_2025-05-09-20-00-46.jsonl',  # Consensus subset
+     'Hard': 'hard_2025-05-08-21-00-10.jsonl',  # Hard examples subset
+ }
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='health_bench',
+         pretty_name='HealthBench',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MEDICAL],
+         description=
+         'HealthBench: a new benchmark designed to better measure capabilities of AI systems for health. Built in partnership with 262 physicians who have practiced in 60 countries, HealthBench includes 5,000 realistic health conversations, each with a custom physician-created rubric to grade model responses.',  # noqa: E501
+         dataset_id='openai-mirror/healthbench',
+         subset_list=SUBSET_LIST,
+         metric_list=[
+             'communication_quality',
+             'instruction_following',
+             'accuracy',
+             'context_awareness',
+             'completeness',
+         ],
+         aggregation='clipped_mean',
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template='Answer the question:\n\n{question}',
+         extra_params={
+             'version': f'# File version, choose from {VERSION}, default to {VERSION[0]}',
+         }
+     )
+ )
+ class HealthBenchAdapter(DefaultDataAdapter):
+     """
+     Adapter for the HealthBench dataset that handles loading health conversation data
+     and evaluating AI responses using physician-created rubrics.
+
+     This adapter supports multiple dataset versions and uses LLM judges to evaluate
+     responses against detailed medical criteria.
+     """
+
+     def __init__(self, *args, **kwargs):
+         """
+         Initialize the HealthBench adapter.
+
+         Sets up default configuration including:
+         - LLM judge evaluation
+         - Dataset version selection
+         - Subset reformatting
+         """
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True  # Use LLM as a judge by default
+         self.reformat_subset = True
+         self.add_aggregation_name = False
+         # Get version from extra parameters, default to first version if not specified
+         self.version = self.extra_params.get('version', VERSION[0])
+         # Validate version parameter
+         if self.version not in VERSION:
+             logger.warning(f'Invalid version {self.version}, choose from {VERSION}, default to {VERSION[0]}')
+             self.version = VERSION[0]
+         # Map version to corresponding data file
+         self.version_file = VERSION_FILE[self.version]
+
+     def load(self):
+         """
+         Load the HealthBench dataset from local or remote source.
+
+         Returns:
+             tuple: (test_dataset, None) where test_dataset is a DatasetDict
+                 containing the loaded data split by subsets
+         """
+         # Try to load dataset from local disk
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download
+
+             # Load dataset from remote
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             # download dataset snapshot
+             dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern=self.version_file)
+
+         # Create local data loader with specified parameters
+         dataset = LocalDataLoader(
+             data_id_or_path=dataset_path,
+             split=self.eval_split,
+             sample_fields=self.record_to_sample,
+             subset=os.path.splitext(self.version_file)[0],  # NOTE: using hardcoded test subset
+             shuffle=self.shuffle,
+         ).load()
+
+         # Convert to DatasetDict and apply subset filtering and limiting
+         test_dataset = DatasetDict.from_dataset(
+             dataset=dataset, subset_list=self.subset_list, limit=self.limit, repeats=self.repeats
+         )
+
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a raw data record to a Sample object.
+
+         Args:
+             record: Raw data record containing prompt, tags, and metadata
+
+         Returns:
+             Sample: Formatted sample with input messages, theme, and metadata
+         """
+         # Convert prompt messages to chat message objects
+         input_messages = [dict_to_chat_message(message) for message in record['prompt']]
+         # Extract theme from example tags, default to 'Unknown' if no tags
+         tags = record['example_tags']
+         theme = tags[0].split(':')[1].strip() if len(tags) > 0 else 'Unknown'
+         return Sample(input=input_messages, target='', subset_key=theme, metadata=record)
+
+     def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         """
+         Evaluate AI response using LLM judge against physician-created rubrics.
+
+         Args:
+             original_prediction: The AI model's original response
+             filtered_prediction: Filtered/processed version of the response
+             reference: Reference answer (not used in this evaluation)
+             task_state: Contains metadata including rubric items
+
+         Returns:
+             Score: Contains overall score, rubric tag scores, and explanations
+         """
+         from .utils import (
+             RubricItem,
+             calculate_rubric_tag_scores,
+             calculate_score,
+             construct_readable_explanation,
+             parse_json_to_dict,
+         )
+
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Extract rubric items and conversation from task metadata
+         example = copy.deepcopy(task_state.metadata)
+         rubric_items = [RubricItem.from_dict(d) for d in example['rubrics']]
+         # Construct full conversation including the AI response
+         convo_with_response = example['prompt'] + [dict(content=original_prediction, role='assistant')]
+         # Format conversation as readable string
+         convo_str = '\n\n'.join([f"{m['role']}: {m['content']}" for m in convo_with_response])
+
+         # Evaluate response against each rubric item using LLM judge
+         grading_response_list = []
+         for rubric_item in rubric_items:
+             # Create judge prompt by substituting conversation and rubric item
+             grader_prompt = GRADER_TEMPLATE.replace('<<conversation>>',
+                                                     convo_str).replace('<<rubric_item>>', str(rubric_item))
+             messages = [ChatMessageUser(content=grader_prompt)]
+             # Retry logic for robust evaluation
+             with retry_context(retries=3, sleep_interval=1):
+                 grading_response = self.llm_judge.judge(messages=messages)
+                 grading_response_dict = parse_json_to_dict(grading_response)
+                 # Validate response format and extract boolean criteria_met field
+                 if 'criteria_met' in grading_response_dict and isinstance(grading_response_dict['criteria_met'], bool):
+                     grading_response_list.append(grading_response_dict)
+                 else:
+                     logger.warning('Grading failed due to bad JSON output, retrying...')
+                     raise ValueError('Grading failed due to bad JSON output')
+
+         # Calculate final scores and explanations
+         overall_score = calculate_score(rubric_items, grading_response_list)  # Overall weighted score
+         rubric_tag_scores, axis_grades = calculate_rubric_tag_scores(
+             rubric_items, grading_response_list
+         )  # Scores by category
+         readable_explanation = construct_readable_explanation(
+             rubric_items, grading_response_list
+         )  # Human-readable results
+
+         # Set score values and metadata
+         score.value = {
+             'overall_score': overall_score,
+             **axis_grades,  # Include axis scores at top level
+         }
+         score.main_score_name = 'overall_score'
+         score.metadata = {
+             'readable_explanation': readable_explanation,
+             'rubric_tag_scores': rubric_tag_scores,
+         }
+         # Store explanation in sample target for reference
+         task_state.target = '**Score Explanation**\n\n' + readable_explanation
+         return score
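
Taken together, the adapter grades each rubric item with an independent judge call and then aggregates the per-item booleans into an overall score plus per-axis scores. A minimal usage sketch follows (not part of this diff; it assumes the `TaskConfig`/`run_task` entry points exported by this release, and the model id, judge configuration, and version choice are illustrative only):

# Hypothetical smoke test for the new health_bench benchmark.
# TaskConfig / run_task are assumed public entry points of this release;
# the model id and the 'Hard' version choice are placeholders.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # hypothetical model id
    datasets=['health_bench'],
    dataset_args={
        'health_bench': {
            'extra_params': {'version': 'Hard'},  # one of VERSION above
        }
    },
    limit=5,  # keep the smoke test small
)
run_task(task_cfg=task_cfg)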
evalscope/benchmarks/healthbench/utils.py
@@ -0,0 +1,102 @@
+ import json
+ import re
+ from collections import defaultdict
+
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+
+ def parse_json_to_dict(json_string: str) -> dict:
+     # Remove markdown-style ```json``` markers if present
+     json_cleaned = re.sub(r'^```json\s*|\s*```$', '', json_string.strip())
+
+     try:
+         return json.loads(json_cleaned)
+     except json.JSONDecodeError as e:
+         logger.warning(f'JSON decoding failed: {e}')
+         return {}
+
+
+ class RubricItem:
+
+     def __init__(self, criterion: str, points: float, tags: list[str]):
+         self.criterion = criterion
+         self.points = points
+         self.tags = tags
+
+     def __str__(self):
+         return f'[{self.points}] {self.criterion}'
+
+     def to_dict(self):
+         return {
+             'criterion': self.criterion,
+             'points': self.points,
+             'tags': self.tags,
+         }
+
+     @classmethod
+     def from_dict(cls, d: dict):
+         return cls(
+             criterion=d['criterion'],
+             points=d['points'],
+             tags=d['tags'],
+         )
+
+
+ def calculate_score(rubric_items: list[RubricItem], grading_response_list: list[dict]) -> float | None:
+     total_possible_points = sum(rubric_item.points for rubric_item in rubric_items if rubric_item.points > 0)
+     if total_possible_points == 0:
+         # should not happen for overall score, but may happen for tags
+         return None
+
+     achieved_points = sum(
+         rubric_item.points
+         for rubric_item, grading_response in zip(rubric_items, grading_response_list, strict=True)
+         if grading_response['criteria_met']
+     )
+     overall_score = achieved_points / total_possible_points
+     return overall_score
+
+
+ def calculate_rubric_tag_scores(rubric_items: list[RubricItem], grading_response_list: list[dict]) -> tuple[dict, dict]:
+     rubric_tag_items_grades = defaultdict(list)
+     axis_grades = defaultdict(list)
+     for rubric_item, grading_response in zip(rubric_items, grading_response_list):
+         curr_item_tags = set()  # Ensure no duplicates in a rubric item.
+         for tag in rubric_item.tags:
+             rubric_tag_items_grades[tag].append((rubric_item, grading_response))
+             assert tag not in curr_item_tags
+             curr_item_tags.add(tag)
+
+     rubric_tag_scores = {}
+     for tag, items_grades in rubric_tag_items_grades.items():
+         items, grades = zip(*items_grades)
+         score = calculate_score(items, grades)
+         if score is not None:  # implies at least one positive criterion
+             rubric_tag_scores[tag] = score
+             if tag.startswith('axis:'):
+                 axis_grades[tag.split(':')[1]] = score
+
+     return rubric_tag_scores, axis_grades
+
+
+ def construct_readable_explanation(rubric_items: list[RubricItem], grading_response_list: list[dict]) -> str:
+     rubric_items_with_grades = []
+     readable_explanation_list = []
+     for rubric_item, grading_response in zip(rubric_items, grading_response_list):
+         explanation = grading_response.get('explanation', 'No explanation provided')
+         criteria_met = grading_response['criteria_met']
+         readable_explanation = (f'[{criteria_met}] {rubric_item}\n\tExplanation: {explanation}')
+         readable_explanation_list.append(readable_explanation)
+         rubric_items_with_grades.append({
+             **rubric_item.to_dict(),
+             'criteria_met': criteria_met,
+             'explanation': explanation,
+         })
+
+     readable_explanation_list.sort(key=lambda x: x.startswith('[False]'), reverse=True)
+     readable_explanation_str = '\n\n'.join(readable_explanation_list)
+     readable_explanation_str = f'\n\n{readable_explanation_str}'
+
+     return readable_explanation_str
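
A small worked example of the scoring arithmetic above (the rubric items and judge verdicts are invented for illustration): positive points define the denominator, while every met criterion, including negatively weighted ones, contributes to the numerator.

# Worked example for calculate_score / calculate_rubric_tag_scores.
# RubricItem and both helpers come from the utils module above.
rubric_items = [
    RubricItem(criterion='Advises seeing a clinician', points=5.0, tags=['axis:accuracy']),
    RubricItem(criterion='Is overly verbose', points=-2.0, tags=['axis:communication_quality']),
]
gradings = [
    {'criteria_met': True, 'explanation': 'Referral advice is present.'},
    {'criteria_met': True, 'explanation': 'The answer rambles.'},
]

# Denominator counts only positive points (5); both met items feed the
# numerator (5 - 2), so the overall score is 3 / 5 = 0.6.
assert calculate_score(rubric_items, gradings) == 0.6

# A tag whose items carry no positive points yields None and is skipped,
# so only the accuracy axis receives a score here.
tag_scores, axis_grades = calculate_rubric_tag_scores(rubric_items, gradings)
print(tag_scores)         # {'axis:accuracy': 1.0}
print(dict(axis_grades))  # {'accuracy': 1.0}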
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -3,158 +3,60 @@ import numpy as np
  import os
  import re

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
- from evalscope.models import ContinuationLogitsModelAdapter
- from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import ResponseParser
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  # flake8: noqa

  logger = get_logger()


- @Benchmark.register(
-     name='hellaswag',
-     dataset_id='modelscope/hellaswag',
-     model_adapter=ContinuationLogitsModelAdapter,
-     subset_list=['default'],
-     metric_list=[AverageAccuracy],
-     few_shot_num=0,
-     train_split='train',
-     eval_split='validation',
-     prompt_template=
-     'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+ @register_benchmark(
+     BenchmarkMeta(
+         name='hellaswag',
+         pretty_name='HellaSwag',
+         tags=[Tags.COMMONSENSE, Tags.MULTIPLE_CHOICE, Tags.KNOWLEDGE],
+         description=
+         'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
+         dataset_id='evalscope/hellaswag',
+         metric_list=['acc'],
+         subset_list=['default'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='validation',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
  )
- class HellaSwagAdapter(DataAdapter):
-
-     choices = ['0', '1', '2', '3']
+ class HellaSwagAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
-
-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num != 0:
-             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-             kwargs['few_shot_num'] = 0
-
          super().__init__(**kwargs)

-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = {}
-         for subset_name in subset_list:
-             data_dict[subset_name] = {}
-             for split in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, f'hellaswag_{split}.jsonl')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, f'hellaswag_{split}.jsonl')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split] = jsonl_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw data, unify the prompt format for HellaSwag benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the HellaSwag:
-
-             {
-                 'ind': 4,
-                 'activity_label': 'Removing ice from car',
-                 'ctx_a': 'Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.',
-                 'ctx_b': 'then',
-                 'ctx': 'Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles. then',
-                 'endings': [', the man adds wax to the windshield and cuts it.', ', a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.', ', the man puts on a christmas coat, knitted with netting.', ', the man continues removing the snow on his car.'],
-                 'source_id': 'activitynet~v_-1IBHYS3L-Y',
-                 'split': 'train',
-                 'split_type': 'indomain',
-                 'label': '3'
-             }
-
-         Returns:
-             Refer to function: evalscope.benchmarks.data_adapter.DataAdapter.gen_prompt for details.
-         """
-
-         endings: list = [self._preprocess(ending) for ending in input_d['endings']]
-
-         few_shot_prompts = [
-             self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
-         ]
-         context: str = '\n'.join(few_shot_prompts) + '\n'
-         context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
-
-         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
+     def record_to_sample(self, record) -> Sample:
+         # Preprocess endings
+         endings = [self._preprocess(ending) for ending in record['endings']]

-         return {
-             'data': ctx_continuation_pair_list,
-             'multi_choices': self.choices,
-             'system_prompt': self.prompt_template
-         }
+         # Create context
+         ctx = record['ctx_a'] + ' ' + record['ctx_b'].capitalize()
+         context = self._preprocess(ctx)

-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d['label']
+         # Get target choice letter
+         target_letter = ['A', 'B', 'C', 'D'][int(record['label'])]

-     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
+         return Sample(
+             input=context,
+             choices=endings,
+             target=target_letter,
+             metadata={'activity_label': record.get('activity_label', 'unknown')},
+         )

-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input dict.
-             eval_type: The evaluation type. e.g. checkpoint, service, custom.
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if eval_type == EvalType.CHECKPOINT:
-             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
-             result = np.array(result)
-             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
-             completion_len = np.array([float(len(i)) for i in endings])
-             best_choice_idx = np.argmax(result / completion_len)
-
-             return str(best_choice_idx)
-         elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option(result)
-         elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option(result)
-         else:
-             raise ValueError(f'Invalid eval_type: {eval_type}')
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=str(gold), pred=str(pred))
-
-     @classmethod
-     def _preprocess(cls, text):
+     def _preprocess(self, text):
          text = text.strip()
          text = text.replace(' [title]', '. ')
          text = re.sub('\\[.*?\\]', '', text)
          text = text.replace('  ', ' ')
          return text
-
-     @classmethod
-     def _generate_prompt(cls, input_d: dict, endings: list, include_answer=True) -> str:
-         """
-         Generate prompt for HellaSwag dataset.
-
-         Args:
-             input_d: a single data of the hellaswag.
-             endings: preprocessed endings
-             include_answer: bool
-
-         Returns:
-
-         """
-
-         ctx = input_d['ctx_a'] + ' ' + input_d['ctx_b'].capitalize()
-         example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
-
-         if include_answer:
-             example += '{}\n\n'.format(endings[int(input_d['label'])])
-
-         return example
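
For reference, applying the new `record_to_sample` to the raw record shown in the removed `gen_prompt` docstring would produce the following (a sketch; `adapter` stands in for a registry-constructed `HellaSwagAdapter` instance):

# Sketch of the new mapping, using the record from the removed docstring.
# `adapter` is assumed to be a HellaSwagAdapter obtained via the registry.
record = {
    'activity_label': 'Removing ice from car',
    'ctx_a': 'Then, the man writes over the snow covering the window of a car, '
             'and a woman wearing winter clothes smiles.',
    'ctx_b': 'then',
    'endings': [
        ', the man adds wax to the windshield and cuts it.',
        ', a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.',
        ', the man puts on a christmas coat, knitted with netting.',
        ', the man continues removing the snow on his car.',
    ],
    'label': '3',
}

sample = adapter.record_to_sample(record)
# sample.input   -> preprocessed 'ctx_a' + capitalized 'ctx_b' ('... smiles. Then')
# sample.choices -> the four preprocessed endings
# sample.target  -> 'D' (label '3' selects the fourth choice letter)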