evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,170 @@ evalscope/benchmarks/drivelology/drivelology_binary_adapter.py
+ # flake8: noqa: E501
+
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip() # noqa: E501
+
+ FEWSHOT_PROMPT_TEMPLATE = """
+ #Instruction#:
+ Classify whether the given text is a Drivelology sample or not.
+
+ #Definition#:
+ - Drivelology: Statements that appear logically coherent but contain deeper, often paradoxical meanings.
+ These challenge conventional interpretation by blending surface-level nonsense with underlying depth,
+ often incorporating elements of humor, irony, or sarcasm, and requiring contextual understanding and
+ emotional insight to unravel their true significance.
+ - non-Drivelology: This includes pure nonsense (grammatically correct but semantically meaningless
+ statements, such as "Colourless green ideas sleep furiously") and normal sentences, including quotes
+ or proverbs, that convey clear or straightforward information without the layered complexity
+ characteristic of Drivelology.
+
+ #Output Format#:
+ You should try your best to answer "Yes" if the given input text is Drivelology, otherwise specify "No".
+ The answer you give MUST be \"Yes\" or \"No\"".
+
+ Here are some examples of how to solve similar problems:
+
+ #Input Text#: Saw a book called "how to solve 50 percent of your problems" so I bought 2 books.
+ #Your Answer#: Yes
+
+ #Input Text#: Colourless green ideas sleep furiously.
+ #Your Answer#: No
+
+ #Input Text#: I went to a restaurant, and saw this guy was choking. I gotta save him. And then I realized he was just speaking French.
+ #Your Answer#: Yes
+
+ #Input Text#: Either it is or it isn't.
+ #Your Answer#: No
+
+ #Input Text#: {text}
+ #Your Answer#:
+ """.strip() # noqa: E501
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_binary',
+         pretty_name='DrivelologyBinaryClassification',
+         tags=[Tags.YES_NO],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['binary-classification'],
+         metric_list=['accuracy', 'precision', 'recall', 'f1_score', 'yes_ratio'],
+         aggregation='f1',
+         few_shot_num=0,
+         eval_split='test',
+         prompt_template='{question}',
+         few_shot_prompt_template='{question}'
+     )
+ )
+ class DrivelologyBinaryClassificationAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+         if self.few_shot_num not in [0, 4]:
+             logger.warning(f'For DrivelologyBinaryClassification, use 4-shot by default.')
+             self.few_shot_num = 4
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         if self.few_shot_num > 0:
+             prompt = FEWSHOT_PROMPT_TEMPLATE.format(text=record['text'])
+         else:
+             prompt = PROMPT_TEMPLATE.format(text=record['text'])
+         content_list: List[Content] = [ContentText(text=prompt)]
+         answer = 'YES' if str(record['label']) == 'drivelology' else 'NO'  # 'YES' or 'NO'
+         return Sample(input=[ChatMessageUser(content=content_list)], target=answer, metadata={
+             'answer': answer,
+         })
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         # Check if the reference answer is in the filtered prediction
+         result = 1 if reference in filtered_prediction.strip().upper() else 0
+         score.value = {'acc': result}
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Custom aggregation to compute accuracy, precision, recall, f1_score, and yes_ratio.
+         """
+
+         def compute_metrics(scores: List[SampleScore]):
+             tp = fp = tn = fn = 0
+             yes_count = 0
+             total_count = len(scores)
+
+             for ss in scores:
+                 gt = ss.sample_metadata['answer'].strip().upper()
+                 # Get prediction based on score
+                 pred = gt if ss.score.main_value == 1 else ('NO' if gt == 'YES' else 'YES')
+                 if pred == 'YES':
+                     yes_count += 1
+                 if pred == 'YES' and gt == 'YES':
+                     tp += 1
+                 elif pred == 'YES' and gt == 'NO':
+                     fp += 1
+                 elif pred == 'NO' and gt == 'NO':
+                     tn += 1
+                 elif pred == 'NO' and gt == 'YES':
+                     fn += 1
+
+             accuracy = (tp + tn) / total_count if total_count > 0 else 0.0
+             precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+             recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+             f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+             yes_ratio = yes_count / total_count if total_count > 0 else 0.0
+
+             return {
+                 'accuracy': accuracy,
+                 'precision': precision,
+                 'recall': recall,
+                 'f1_score': f1_score,
+                 'yes_ratio': yes_ratio
+             }
+
+         overall_metrics = compute_metrics(sample_scores)
+         agg_scores = []
+         for metric_name, value in overall_metrics.items():
+             agg_scores.append(AggScore(metric_name=metric_name, score=value, num=len(sample_scores), metadata={}))
+
+         return agg_scores
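
For context, the adapter above registers under the name 'drivel_binary', so it can be selected like any other benchmark through evalscope's TaskConfig/run_task entry points. A minimal sketch, assuming those documented entry points keep their usual signatures in 1.2.0; the model id and the limit value below are placeholders and are not taken from this diff.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['drivel_binary'],        # registry name declared by @register_benchmark above
    limit=10,                          # placeholder: score only a small smoke-test subset
)
run_task(task_cfg=task_cfg)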
@@ -0,0 +1,254 @@
1
+ import re
2
+ from typing import Any, Dict, List
3
+
4
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
5
+ from evalscope.api.dataset import Sample
6
+ from evalscope.api.evaluator import TaskState
7
+ from evalscope.api.messages import ChatMessageUser, ContentText
8
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
9
+ from evalscope.api.registry import register_benchmark
10
+ from evalscope.constants import Tags
11
+ from evalscope.utils.logger import get_logger
12
+ from evalscope.utils.multi_choices import parse_answers, prompt
13
+
14
+ logger = get_logger()
15
+
16
+ DESCRIPTION = (
17
+ 'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
18
+ 'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
19
+ 'or rhetorically subversive.'
20
+ )
21
+
22
+ MULTIPLE_ANSWER_TEMPLATE = r"""
23
+ #Instruction#:
24
+ Classify the given text into one or more of the following categories: inversion, wordplay, switchbait, paradox, and misdirection.
25
+
26
+ #Definitions#:
27
+ - inversion: This technique takes a well-known phrase, cliché, or social script and flips it on its head. The humour arises by reversing a familiar structure to creating a new, often satirical, meaning.
28
+ - wordplay: This is the use of linguistic creativity, often by exploiting the phonetics or polysemy of words. It includes puns, double entendres, and similarities.
29
+ - switchbait: This technique hinges on a specific phrase (the "bait") that has a culturally-embedded double meaning. The initial context is then suddenly replaced (the "switch") by a surprising second meaning. The humour is generated by this cynical or culturally-specific reinterpretation of the bait, rather than by derailing a narrative.
30
+ - paradox: This relies on a statement that appears logically self-contradictory but contains a latent, often humorous or profound truth. The core of the technique is the clash of seemingly incompatible ideas.
31
+ - misdirection: This technique leads the listener down an expected path before a final twist reveals a different, often more literal or absurd, ending.
32
+
33
+ Answer the following multiple choice question where multiple answers may be correct.
34
+ The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
35
+
36
+ {question}
37
+
38
+ {choices}
39
+ """.strip() # noqa: E501
40
+
41
+
42
+ @register_benchmark(
43
+ BenchmarkMeta(
44
+ name='drivel_multilabel',
45
+ pretty_name='DrivelologyMultilabelClassification',
46
+ tags=[Tags.MULTIPLE_CHOICE],
47
+ description=DESCRIPTION.strip(),
48
+ dataset_id='extraordinarylab/drivel-hub',
49
+ subset_list=['multi-label-classification'],
50
+ metric_list=['f1_weighted', 'f1_micro', 'f1_macro', 'exact_match'],
51
+ aggregation='f1_weighted',
52
+ eval_split='test',
53
+ prompt_template='{question}',
54
+ )
55
+ )
56
+ class DrivelologyMultilabelClassificationAdapter(DefaultDataAdapter):
57
+
58
+ def __init__(self, *args, **kwargs):
59
+ super().__init__(*args, **kwargs)
60
+ self.categories = ['inversion', 'wordplay', 'switchbait', 'paradox', 'misdirection']
61
+ self.choices = {'A': 'inversion', 'B': 'wordplay', 'C': 'switchbait', 'D': 'paradox', 'E': 'misdirection'}
62
+ self.categories_to_letters = {v: k for k, v in self.choices.items()}
63
+
64
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
65
+ text: str = record['text']
66
+ label: List[str] = record['label']
67
+ question = f'Text to classify: {text}'
68
+ choices_list = [f'{key}. {value}' for key, value in self.choices.items()]
69
+ input_text = prompt(question=question, choices=choices_list, template=MULTIPLE_ANSWER_TEMPLATE)
70
+ content_list = [ContentText(text=input_text)]
71
+ target_letters = ''.join(
72
+ sorted([self.categories_to_letters[cat] for cat in label if cat in self.categories_to_letters])
73
+ )
74
+ metadata = {'text': text, 'label': label, 'target_letters': target_letters}
75
+ return Sample(
76
+ input=[ChatMessageUser(content=content_list)],
77
+ choices=choices_list,
78
+ target=target_letters,
79
+ metadata=metadata,
80
+ )
81
+
82
+ def extract_answer(self, prediction: str, task_state: TaskState) -> str:
83
+ pattern = r'ANSWER:\s*([A-E]+)'
84
+ match = re.search(pattern, prediction)
85
+ if match:
86
+ letters = match.group(1).strip().upper()
87
+ return ''.join(sorted(set(letters)))
88
+ else:
89
+ try:
90
+ answers = parse_answers(prediction)
91
+ return ''.join(sorted(list(answers)))
92
+ except Exception as e:
93
+ logger.warning(f'Could not extract answer from: {prediction}. Error: {e}')
94
+ return ''
95
+
96
+ def match_score(
97
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
98
+ ) -> Score:
99
+ """
100
+ Calculate the match score between the prediction and reference for multilabel classification.
101
+
102
+ Args:
103
+ original_prediction: The original model output
104
+ filtered_prediction: The extracted answer (letter format, e.g., "AC")
105
+ reference: The reference answer (letter format, e.g., "AC")
106
+ task_state: The current task state
107
+
108
+ Returns:
109
+ Score object with metrics
110
+ """
111
+ # Create a Score object as required by the API
112
+ score = Score(
113
+ extracted_prediction=filtered_prediction,
114
+ prediction=original_prediction,
115
+ )
116
+
117
+ # Convert letter answers to category sets
118
+ pred_categories = set(self.choices.get(letter, '') for letter in filtered_prediction)
119
+ target_categories = set(self.choices.get(letter, '') for letter in reference)
120
+
121
+ # Remove empty strings (may be caused by invalid letters)
122
+ pred_categories = {cat for cat in pred_categories if cat}
123
+ target_categories = {cat for cat in target_categories if cat}
124
+
125
+ # Calculate TP (true positives), FP (false positives), and FN (false negatives)
126
+ tp = len(pred_categories & target_categories) # intersection
127
+ fp = len(pred_categories - target_categories) # in prediction but not in target
128
+ fn = len(target_categories - pred_categories) # in target but not in prediction
129
+
130
+ # Calculate precision, recall and F1 score
131
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
132
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
133
+ f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
134
+
135
+ # Calculate exact match (1.0 if prediction exactly matches target)
136
+ exact_match = 1.0 if pred_categories == target_categories else 0.0
137
+
138
+ # Store category information in metadata for later aggregation
139
+ category_data = {}
140
+ for cat in self.categories:
141
+ in_pred = cat in pred_categories
142
+ in_target = cat in target_categories
143
+
144
+ category_data[cat] = {
145
+ 'tp': 1 if in_pred and in_target else 0,
146
+ 'fp': 1 if in_pred and not in_target else 0,
147
+ 'fn': 1 if not in_pred and in_target else 0,
148
+ 'support': 1 if in_target else 0
149
+ }
150
+
+        # Set simple numerical values in score.value as expected by the API
+        score.value = {'f1': f1, 'precision': precision, 'recall': recall, 'exact_match': exact_match}
+
+        # Store category data in metadata for aggregation
+        score.metadata = {'category_data': category_data}
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate scores across all samples.
+        Computes weighted, macro, and micro F1 scores for multilabel classification.
+
+        Args:
+            sample_scores: List of sample scores
+
+        Returns:
+            List of aggregated scores
+        """
+        if not sample_scores:
+            return [
+                AggScore(metric_name='f1_weighted', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='f1_micro', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='f1_macro', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='exact_match', score=0.0, num=0, metadata={})
+            ]
+
+        # Initialize category statistics
+        category_stats = {cat: {'tp': 0, 'fp': 0, 'fn': 0, 'support': 0} for cat in self.categories}
+        total_exact_matches = 0
+        num_samples = len(sample_scores)
+
+        # Aggregate statistics across all samples
+        for ss in sample_scores:
+            # Add exact match score to total
+            total_exact_matches += ss.score.value.get('exact_match', 0)
+
+            # Get category data from metadata
+            if 'category_data' in ss.score.metadata:
+                cat_data = ss.score.metadata['category_data']
+                for cat, stats in cat_data.items():
+                    if cat in self.categories:
+                        category_stats[cat]['tp'] += stats.get('tp', 0)
+                        category_stats[cat]['fp'] += stats.get('fp', 0)
+                        category_stats[cat]['fn'] += stats.get('fn', 0)
+                        category_stats[cat]['support'] += stats.get('support', 0)
+
+        # Calculate F1 scores for each category
+        category_f1 = {}
+        total_support = sum(stats['support'] for stats in category_stats.values())
+        f1_sum = 0.0
+
+        for cat, stats in category_stats.items():
+            tp = stats['tp']
+            fp = stats['fp']
+            fn = stats['fn']
+
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+            category_f1[cat] = f1
+            f1_sum += f1
+
+        # Calculate micro-average F1 (based on aggregate TP, FP, FN)
+        total_tp = sum(stats['tp'] for stats in category_stats.values())
+        total_fp = sum(stats['fp'] for stats in category_stats.values())
+        total_fn = sum(stats['fn'] for stats in category_stats.values())
+
+        micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
+        micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
+        f1_micro = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if (
+            micro_precision + micro_recall
+        ) > 0 else 0.0
+
+        # Calculate macro-average F1 (simple average of category F1 scores)
+        f1_macro = f1_sum / len(self.categories) if self.categories else 0.0
+
+        # Calculate weighted-average F1 (weighted by support)
+        f1_weighted = 0.0
+        if total_support > 0:
+            for cat, stats in category_stats.items():
+                cat_f1 = category_f1[cat]
+                weight = stats['support'] / total_support
+                f1_weighted += cat_f1 * weight
+
+        # Calculate accuracy (proportion of exact matches)
+        exact_match = total_exact_matches / num_samples
+
+        # Return list of aggregate scores
+        return [
+            AggScore(
+                metric_name='f1_weighted',
+                score=f1_weighted,
+                num=num_samples,
+                metadata={'category_f1': {
+                    cat: f1
+                    for cat, f1 in category_f1.items()
+                }}
+            ),
+            AggScore(metric_name='f1_micro', score=f1_micro, num=num_samples, metadata={}),
+            AggScore(metric_name='f1_macro', score=f1_macro, num=num_samples, metadata={}),
+            AggScore(metric_name='exact_match', score=exact_match, num=num_samples, metadata={})
+        ]
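A note on the aggregation above: macro-F1 is the unweighted mean of the per-category F1 values, micro-F1 is computed from the pooled TP/FP/FN counts, and weighted-F1 weights each category's F1 by its support. The following minimal, self-contained sketch (not part of the package; the category names and counts are invented for illustration) applies the same formulas to toy data:

# Toy per-category counts in the same shape as category_stats above (invented values).
category_stats = {
    'PER': {'tp': 8, 'fp': 2, 'fn': 1, 'support': 9},
    'LOC': {'tp': 3, 'fp': 1, 'fn': 3, 'support': 6},
}

def f1(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

category_f1 = {cat: f1(s['tp'], s['fp'], s['fn']) for cat, s in category_stats.items()}

# Macro: unweighted mean of the per-category F1 scores.
f1_macro = sum(category_f1.values()) / len(category_f1)

# Micro: F1 over the pooled TP/FP/FN counts.
f1_micro = f1(
    sum(s['tp'] for s in category_stats.values()),
    sum(s['fp'] for s in category_stats.values()),
    sum(s['fn'] for s in category_stats.values()),
)

# Weighted: per-category F1 weighted by support.
total_support = sum(s['support'] for s in category_stats.values())
f1_weighted = sum(category_f1[cat] * s['support'] / total_support for cat, s in category_stats.items())

print(round(f1_macro, 3), round(f1_micro, 3), round(f1_weighted, 3))

On skewed label distributions the three averages can diverge noticeably, which is why the adapter reports all of them alongside exact_match.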
@@ -0,0 +1,49 @@
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+PROMPT_TEMPLATE = r"""
+Which of the following options best represents the underlying narrative of the text?
+The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+{question}
+
+{choices}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_selection',
+        pretty_name='DrivelologyNarrativeSelection',
+        tags=[Tags.MULTIPLE_CHOICE],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['multiple-choice-english-easy', 'multiple-choice-english-hard'],
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class DrivelologyNarrativeSelectionAdapter(MultiChoiceAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_overall_metric = False
+
+    def record_to_sample(self, record) -> Sample:
+        return Sample(
+            input=record['text'],
+            choices=record['choices'],
+            target=record['answer'],
+            metadata={},
+        )
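The prompt above constrains the model to reply in the form 'ANSWER: $LETTER'; the actual reply parsing is handled by MultiChoiceAdapter inside the framework and is not part of this diff. As a rough, hypothetical illustration of how such a reply can be reduced to an answer letter (the function name and regex below are invented for this sketch):

import re
from typing import Optional

def extract_choice_letter(response: str, letters: str = 'ABCD') -> Optional[str]:
    """Return the answer letter from a reply like 'ANSWER: B', or None if absent."""
    match = re.search(r'ANSWER\s*:\s*([A-Z])', response.strip(), flags=re.IGNORECASE)
    if match and match.group(1).upper() in letters:
        return match.group(1).upper()
    return None

assert extract_choice_letter('ANSWER: C') == 'C'
assert extract_choice_letter('The narrative is ironic.\nANSWER: a') == 'A'
assert extract_choice_letter('No idea.') is None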
@@ -0,0 +1,218 @@
+import re
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DESCRIPTION = (
+    'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+    'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+    'or rhetorically subversive.'
+)
+
+# Keep the original generation and evaluation templates
+NARRATIVE_GENERATION_TEMPLATE = """
+You need to first read and understand the text given. Generate a detailed description to illustrate the implicit narrative of the text.
+
+Please provide your response in English, with a clear and comprehensive explanation of the narrative.
+
+Text: {text}
+""".strip()  # noqa: E501
+
+NARRATIVE_EVALUATION_TEMPLATE = """
+Please act as an impartial judge and evaluate how accurately the candidate narrative matches the given reference narrative.
+Your evaluation should consider factors such as the relevance, accuracy, depth, and level of detail of the candidate narrative compared to the reference.
+
+Begin your evaluation by providing a short explanation in English. Be as objective as possible.
+
+After providing your explanation, you must rate the match on a Likert scale from 1 to 5, where:
+1 = Very poor match
+2 = Poor match
+3 = Moderate match
+4 = Good match
+5 = Excellent match
+
+Please format your rating strictly as: "Rating: [[X]]" where X is a whole number from 1 to 5.
+
+[Candidate Narrative]
+{candidate}
+
+[Reference Narrative]
+{reference}
+""".strip()  # noqa: E501
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='drivel_writing',
+        pretty_name='DrivelologyNarrativeWriting',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING],
+        description=DESCRIPTION.strip(),
+        dataset_id='extraordinarylab/drivel-hub',
+        subset_list=['narrative-writing-english'],
+        metric_list={
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/roberta-large',
+                'model_type': 'roberta-large'
+            },
+            'gpt_score': {}
+        },
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=NARRATIVE_GENERATION_TEMPLATE
+    )
+)
+class DrivelologyNarrativeWritingAdapter(DefaultDataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._use_llm_judge = True  # Use LLM as a judge by default
+        self.use_batch_scoring = True  # Enable batch scoring
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        text = record['text']
+        reference_narrative = record['narrative']
+
+        # Format the generation prompt with the text
+        input_prompt = NARRATIVE_GENERATION_TEMPLATE.format(text=text)
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=reference_narrative,
+            metadata={
+                'text': text,
+                'reference_narrative': reference_narrative
+            }
+        )
+
+    def batch_match_score(self, original_predictions, filtered_predictions, references, task_states):
+        """
+        Batch calculate the match scores using BERTScore.
+        """
+        from evalscope.metrics.metric import BertScore
+
+        score_args = self.metric_list.get('bert_score', {})
+        bert_scorer = BertScore(**score_args)
+        bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+        scores = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={'bert_score': bert_score_f1[i]}
+            )
+            scores.append(score)
+        return scores
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """
+        Calculate the match score using LLM judge and BERTScore.
+        """
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        # Initialize score value dictionary
+        score.value = {}
+
+        # Use LLM judge to evaluate narrative quality
+        eval_prompt = NARRATIVE_EVALUATION_TEMPLATE.format(candidate=filtered_prediction, reference=reference)
+
+        judge_response = self.llm_judge.judge(eval_prompt)
+        logger.info(f'LLM judge response received (first 100 chars): {judge_response[:100]}...')
+
+        # Extract rating using regex pattern
+        match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
+        if match:
+            rating = int(match.group(1))
+            gpt_score = (rating - 1) / 4.0  # Normalize to 0-1 scale
+            logger.info(f'Rating extracted: {rating}/5 -> {gpt_score}')
+        else:
+            # Try alternative pattern
+            alt_match = re.search(r'(\[\[|\[)([1-5])(\]\]|\])', judge_response)
+            if alt_match:
+                rating = int(alt_match.group(2))
+                gpt_score = (rating - 1) / 4.0
+                logger.info(f'Rating extracted (alt pattern): {rating}/5 -> {gpt_score}')
+            else:
+                # Last resort: standalone digit
+                number_match = re.search(r'(?<!\d)[1-5](?!\d)', judge_response)
+                if number_match:
+                    rating = int(number_match.group(0))
+                    gpt_score = (rating - 1) / 4.0
+                    logger.info(f'Rating extracted (fallback): {rating}/5 -> {gpt_score}')
+                else:
+                    gpt_score = 0.0
+                    logger.warning('No rating found in response, using default 0.0')
+
+        score.value['gpt_score'] = gpt_score
+        score.explanation = f'LLM judge rating: {gpt_score:.2f}'
+
+        score.metadata = {
+            'judge_response': judge_response[:300],
+            'model': getattr(self.llm_judge, 'model_id', 'unknown')
+        }
+
+        score.main_score_name = 'gpt_score'
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate scores across all samples.
+        """
+        if not sample_scores:
+            return [
+                AggScore(metric_name='gpt_score', score=0.0, num=0, metadata={}),
+                AggScore(metric_name='bert_score', score=0.0, num=0, metadata={})
+            ]
+
+        # Extract scores
+        gpt_scores = [ss.score.value.get('gpt_score', 0.0) for ss in sample_scores]
+        bert_scores = [ss.score.value.get('bert_score', 0.0) for ss in sample_scores]
+
+        # Calculate averages
+        avg_gpt_score = sum(gpt_scores) / len(gpt_scores) if gpt_scores else 0.0
+        avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0.0
+
+        return [
+            AggScore(
+                metric_name='gpt_score',
+                score=avg_gpt_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(gpt_scores),
+                    'max_score': max(gpt_scores)
+                }
+            ),
+            AggScore(
+                metric_name='bert_score',
+                score=avg_bert_score,
+                num=len(sample_scores),
+                metadata={
+                    'min_score': min(bert_scores),
+                    'max_score': max(bert_scores)
+                }
+            )
+        ]
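For reference, the judge-scoring path in llm_match_score maps a 1-to-5 Likert rating onto [0, 1] via (rating - 1) / 4 and falls back through progressively looser regex patterns when the strict 'Rating: [[X]]' format is missing. A condensed, standalone sketch of that extraction logic (illustrative only; the adapter's version above also logs each step and stores judge metadata):

import re

def extract_gpt_score(judge_response: str) -> float:
    """Parse a 1-5 rating from a judge response and normalize it to the [0, 1] range."""
    patterns = [
        r'Rating:\s*\[\[([1-5])\]\]',      # strict format: Rating: [[4]]
        r'(?:\[\[|\[)([1-5])(?:\]\]|\])',  # looser: [[4]] or [4]
        r'(?<!\d)([1-5])(?!\d)',           # last resort: a standalone digit 1-5
    ]
    for pattern in patterns:
        match = re.search(pattern, judge_response)
        if match:
            return (int(match.group(1)) - 1) / 4.0
    return 0.0

assert extract_gpt_score('The match is good. Rating: [[4]]') == 0.75
assert extract_gpt_score('Score [2]') == 0.25
assert extract_gpt_score('no rating given') == 0.0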