evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -1,57 +1,83 @@
1
- from collections import defaultdict
2
1
  from typing import Any, Dict, List
3
2
 
4
- from evalscope.benchmarks import Benchmark, DataAdapter
5
- from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
6
- from evalscope.constants import EvalType
7
- from evalscope.metrics import Metric, mean
8
- from evalscope.models import ChatGenerationModelAdapter
9
- from evalscope.utils.utils import normalize_score
10
-
11
-
12
- @Benchmark.register(
13
- name='ifeval',
14
- dataset_id='opencompass/ifeval',
15
- model_adapter=ChatGenerationModelAdapter,
16
- subset_list=['default'],
17
- metric_list=[
18
- Metric(name='prompt_level_strict_acc', object=mean),
19
- Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
20
- Metric(name='prompt_level_loose_acc', object=mean),
21
- Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
22
- ],
23
- few_shot_num=0,
24
- train_split=None,
25
- eval_split='train',
26
- prompt_template='',
3
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
4
+ from evalscope.api.dataset import Sample
5
+ from evalscope.api.evaluator import TaskState
6
+ from evalscope.api.messages import ChatMessageUser
7
+ from evalscope.api.metric import Score
8
+ from evalscope.api.registry import register_benchmark
9
+ from evalscope.constants import Tags
10
+ from evalscope.utils.logger import get_logger
11
+
12
+ logger = get_logger()
13
+
14
+
15
+ @register_benchmark(
16
+ BenchmarkMeta(
17
+ name='ifeval',
18
+ pretty_name='IFEval',
19
+ description=
20
+ 'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
21
+ tags=[Tags.INSTRUCTION_FOLLOWING],
22
+ dataset_id='opencompass/ifeval',
23
+ subset_list=['default'],
24
+ metric_list=[
25
+ 'prompt_level_strict',
26
+ 'inst_level_strict',
27
+ 'prompt_level_loose',
28
+ 'inst_level_loose',
29
+ ],
30
+ few_shot_num=0,
31
+ train_split=None,
32
+ eval_split='train',
33
+ prompt_template='',
34
+ )
27
35
  )
28
- class IFEvalAdapter(DataAdapter):
36
+ class IFEvalAdapter(DefaultDataAdapter):
29
37
 
30
38
  def __init__(self, **kwargs):
31
39
  super().__init__(**kwargs)
32
40
 
33
- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
34
- return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
41
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
42
+ """
43
+ Convert a data record to a Sample object.
44
+
45
+ Args:
46
+ record (Dict[str, Any]): Input data record.
47
+
48
+ Returns:
49
+ Sample: Sample object with input, target, and metadata.
50
+ """
51
+ prompt = record.get('prompt', '')
52
+ message_list = [ChatMessageUser(content=prompt)]
53
+
54
+ return Sample(input=message_list, target='', metadata=record)
55
+
56
+ def match_score(
57
+ self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
58
+ ) -> Score:
59
+ """
60
+ Calculate evaluation scores by comparing prediction with reference.
61
+ """
62
+ from evalscope.benchmarks.ifeval.utils import process_results
35
63
 
36
- def get_gold_answer(self, input_d: dict) -> str:
37
- return input_d
64
+ # Initialize the score object with prediction details
65
+ score = Score(
66
+ extracted_prediction=filtered_prediction,
67
+ prediction=original_prediction,
68
+ )
38
69
 
39
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
40
- return result
70
+ doc = task_state.metadata
71
+ try:
72
+ # Process results using the existing ifeval utility
73
+ results = process_results(doc, [filtered_prediction])
74
+ score.value.update(results)
41
75
 
42
- def match(self, gold: Any, pred: Any) -> Dict:
43
- return process_results(gold, [pred])
76
+ # Set main score name
77
+ score.main_score_name = 'prompt_level_strict'
44
78
 
45
- def compute_metric(self, review_res_list: List[dict]) -> Any:
46
- # aggregate review results
47
- res_dict = defaultdict(list)
48
- for res in review_res_list:
49
- for k, v in res.items():
50
- res_dict[k].append(v)
79
+ except Exception as e:
80
+ logger.error(f'Error calculating ifeval metrics: {e}')
81
+ score.value = {}
51
82
 
52
- metrics = []
53
- for metric in self.metric_list:
54
- metric_name = metric.name
55
- pred_value = res_dict[metric_name]
56
- metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
57
- return metrics
83
+ return score
@@ -15,14 +15,13 @@
15
15
 
16
16
  import collections
17
17
  import json
18
- import langdetect
19
18
  import logging
20
19
  import random
21
20
  import re
22
21
  import string
23
22
  from typing import Dict, Optional, Sequence, Union
24
23
 
25
- from evalscope.benchmarks.ifeval import instructions_util
24
+ from . import instructions_util
26
25
 
27
26
  _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
28
27
 
@@ -141,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
141
140
  if self._language is None:
142
141
  self._language = random.choice(list(_LANGUAGES.keys()))
143
142
  # TODO(tianjianlu): opens the description generation to more choices.
144
- self._description_pattern = ('Your ENTIRE response should be in {language} language, no other '
145
- + 'language is allowed.')
143
+ self._description_pattern = (
144
+ 'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
145
+ )
146
146
  return self._description_pattern.format(language=_LANGUAGES[self._language])
147
147
 
148
148
  def get_instruction_args(self):
@@ -163,7 +163,7 @@ class ResponseLanguageChecker(Instruction):
163
163
  True if the language of `value` follows instruction; otherwise False.
164
164
  """
165
165
  assert isinstance(value, str)
166
-
166
+ import langdetect
167
167
  try:
168
168
  return langdetect.detect(value) == self._language
169
169
  except langdetect.LangDetectException as e:
@@ -198,8 +198,10 @@ class NumberOfSentences(Instruction):
198
198
  if relation is None:
199
199
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
200
200
  elif relation not in _COMPARISON_RELATION:
201
- raise ValueError('The supported relation for comparison must be in '
202
- f'{_COMPARISON_RELATION}, but {relation} is given.')
201
+ raise ValueError(
202
+ 'The supported relation for comparison must be in '
203
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
204
+ )
203
205
  else:
204
206
  self._comparison_relation = relation
205
207
 
@@ -256,8 +258,10 @@ class PlaceholderChecker(Instruction):
256
258
  self._num_placeholders = num_placeholders
257
259
  if self._num_placeholders is None or self._num_placeholders < 0:
258
260
  self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
259
- self._description_pattern = ('The response must contain at least {num_placeholders} placeholders '
260
- + 'represented by square brackets, such as [address].')
261
+ self._description_pattern = (
262
+ 'The response must contain at least {num_placeholders} placeholders '
263
+ + 'represented by square brackets, such as [address].'
264
+ )
261
265
  return self._description_pattern.format(num_placeholders=self._num_placeholders)
262
266
 
263
267
  def get_instruction_args(self):
@@ -299,9 +303,10 @@ class BulletListChecker(Instruction):
299
303
  self._num_bullets = num_bullets
300
304
  if self._num_bullets is None or self._num_bullets < 0:
301
305
  self._num_bullets = random.randint(1, _NUM_BULLETS)
302
- self._description_pattern = ('Your answer must contain exactly {num_bullets} bullet points. '
303
- + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n'
304
- + '* This is point 2')
306
+ self._description_pattern = (
307
+ 'Your answer must contain exactly {num_bullets} bullet points. '
308
+ + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
309
+ )
305
310
  return self._description_pattern.format(num_bullets=self._num_bullets)
306
311
 
307
312
  def get_instruction_args(self):
@@ -380,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
380
385
  self._starter = starter.strip() if isinstance(starter, str) else starter
381
386
  if self._starter is None:
382
387
  self._starter = random.choice(_STARTER_OPTIONS)
383
- self._description_pattern = ('During the conversation, when it is your turn, '
384
- + 'please always start with {starter}')
388
+ self._description_pattern = (
389
+ 'During the conversation, when it is your turn, ' + 'please always start with {starter}'
390
+ )
385
391
  return self._description_pattern.format(starter=self._starter)
386
392
 
387
393
  def get_instruction_args(self):
@@ -424,8 +430,10 @@ class HighlightSectionChecker(Instruction):
424
430
  if self._num_highlights is None or self._num_highlights < 0:
425
431
  self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
426
432
 
427
- self._description_pattern = ('Highlight at least {num_highlights} sections in your answer with '
428
- + 'markdown, i.e. *highlighted section*.')
433
+ self._description_pattern = (
434
+ 'Highlight at least {num_highlights} sections in your answer with '
435
+ + 'markdown, i.e. *highlighted section*.'
436
+ )
429
437
 
430
438
  return self._description_pattern.format(num_highlights=self._num_highlights)
431
439
 
@@ -483,9 +491,11 @@ class SectionChecker(Instruction):
483
491
  if self._num_sections is None or self._num_sections < 0:
484
492
  self._num_sections = random.randint(1, _NUM_SECTIONS)
485
493
 
486
- self._description_pattern = ('Your response must have {num_sections} sections. Mark the beginning '
487
- + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
488
- + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]')
494
+ self._description_pattern = (
495
+ 'Your response must have {num_sections} sections. Mark the beginning '
496
+ + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
497
+ + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
498
+ )
489
499
 
490
500
  return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)
491
501
 
@@ -535,8 +545,9 @@ class ParagraphChecker(Instruction):
535
545
  if self._num_paragraphs is None or self._num_paragraphs < 0:
536
546
  self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
537
547
 
538
- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
539
- + 'Paragraphs are separated with the markdown divider: ***')
548
+ self._description_pattern = (
549
+ 'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
550
+ )
540
551
 
541
552
  return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
542
553
 
@@ -586,12 +597,14 @@ class PostscriptChecker(Instruction):
586
597
  A string representing the instruction description.
587
598
  """
588
599
  self._postscript_marker = (
589
- postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker)
600
+ postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
601
+ )
590
602
  if self._postscript_marker is None:
591
603
  self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
592
604
 
593
- self._description_pattern = ('At the end of your response, please explicitly add a postscript '
594
- + 'starting with {postscript}')
605
+ self._description_pattern = (
606
+ 'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
607
+ )
595
608
 
596
609
  return self._description_pattern.format(postscript=self._postscript_marker)
597
610
 
@@ -645,8 +658,10 @@ class RephraseChecker(Instruction):
645
658
  'in the form of *change me*.')
646
659
 
647
660
  self._reference_without_change = original_message
648
- self._description = ('Rephrasing: Your rephrased response should only'
649
- + 'change the words/sentences in between two asterisks' + 'such as *change me*.')
661
+ self._description = (
662
+ 'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
663
+ + 'such as *change me*.'
664
+ )
650
665
  return self._description
651
666
 
652
667
  def get_instruction_args(self):
@@ -758,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
758
773
  if relation is None:
759
774
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
760
775
  elif relation not in _COMPARISON_RELATION:
761
- raise ValueError('The supported relation for comparison must be in '
762
- f'{_COMPARISON_RELATION}, but {relation} is given.')
776
+ raise ValueError(
777
+ 'The supported relation for comparison must be in '
778
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
779
+ )
763
780
  else:
764
781
  self._comparison_relation = relation
765
782
 
766
- self._description_pattern = ('In your response, the word {keyword} should appear {relation} '
767
- + '{frequency} times.')
783
+ self._description_pattern = (
784
+ 'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
785
+ )
768
786
 
769
787
  return self._description_pattern.format(
770
788
  keyword=self._keyword,
@@ -820,8 +838,10 @@ class NumberOfWords(Instruction):
820
838
  if relation is None:
821
839
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
822
840
  elif relation not in _COMPARISON_RELATION:
823
- raise ValueError('The supported relation for comparison must be in '
824
- f'{_COMPARISON_RELATION}, but {relation} is given.')
841
+ raise ValueError(
842
+ 'The supported relation for comparison must be in '
843
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
844
+ )
825
845
  else:
826
846
  self._comparison_relation = relation
827
847
 
@@ -851,8 +871,10 @@ class JsonFormat(Instruction):
851
871
  """Check the Json format."""
852
872
 
853
873
  def build_description(self):
854
- self._description_pattern = ('Entire output should be wrapped in JSON format. You can use markdown'
855
- ' ticks such as ```.')
874
+ self._description_pattern = (
875
+ 'Entire output should be wrapped in JSON format. You can use markdown'
876
+ ' ticks such as ```.'
877
+ )
856
878
  return self._description_pattern
857
879
 
858
880
  def get_instruction_args(self):
@@ -865,8 +887,9 @@ class JsonFormat(Instruction):
865
887
 
866
888
  def check_following(self, value):
867
889
  value = (
868
- value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
869
- '```').removesuffix('```').strip())
890
+ value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
891
+ removesuffix('```').strip()
892
+ )
870
893
  try:
871
894
  json.loads(value)
872
895
  except ValueError:
@@ -904,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
904
927
  self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
905
928
  self._first_word = self._first_word.lower()
906
929
 
907
- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
908
- + 'Paragraphs and only paragraphs are separated with each other by two '
909
- + "new lines as if it was '\\n\\n' in python. "
910
- + 'Paragraph {nth_paragraph} must start with word {first_word}.')
930
+ self._description_pattern = (
931
+ 'There should be {num_paragraphs} paragraphs. '
932
+ + 'Paragraphs and only paragraphs are separated with each other by two '
933
+ + "new lines as if it was '\\n\\n' in python. "
934
+ + 'Paragraph {nth_paragraph} must start with word {first_word}.'
935
+ )
911
936
 
912
937
  return self._description_pattern.format(
913
938
  num_paragraphs=self._num_paragraphs,
@@ -1085,11 +1110,12 @@ class RephraseParagraph(Instruction):
1085
1110
  self._low = low
1086
1111
  self._high = high
1087
1112
 
1088
- self._description = ('Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
1089
- + 'between {low} and {high} of the same words. '
1090
- + 'Words are the same if and only if all of the '
1091
- + 'letters, ignoring cases, are the same. For '
1092
- + "example, 'run' is the same as 'Run' but different " + "to 'ran'.")
1113
+ self._description = (
1114
+ 'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
1115
+ + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
1116
+ + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
1117
+ + "to 'ran'."
1118
+ )
1093
1119
 
1094
1120
  return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)
1095
1121
 
@@ -1124,8 +1150,10 @@ class TwoResponsesChecker(Instruction):
1124
1150
 
1125
1151
  def build_description(self):
1126
1152
  """Build the instruction description."""
1127
- self._description_pattern = ('Give two different responses. Responses and only responses should'
1128
- ' be separated by 6 asterisk symbols: ******.')
1153
+ self._description_pattern = (
1154
+ 'Give two different responses. Responses and only responses should'
1155
+ ' be separated by 6 asterisk symbols: ******.'
1156
+ )
1129
1157
  return self._description_pattern
1130
1158
 
1131
1159
  def get_instruction_args(self):
@@ -1172,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
1172
1200
  raise ValueError('prompt_to_repeat must be set.')
1173
1201
  else:
1174
1202
  self._prompt_to_repeat = prompt_to_repeat
1175
- self._description_pattern = ('First repeat the request word for word without change,'
1176
- ' then give your answer (1. do not say any words or characters'
1177
- ' before repeating the request; 2. the request you need to repeat'
1178
- ' does not include this sentence)')
1203
+ self._description_pattern = (
1204
+ 'First repeat the request word for word without change,'
1205
+ ' then give your answer (1. do not say any words or characters'
1206
+ ' before repeating the request; 2. the request you need to repeat'
1207
+ ' does not include this sentence)'
1208
+ )
1179
1209
  return self._description_pattern
1180
1210
 
1181
1211
  def get_instruction_args(self):
@@ -1206,8 +1236,10 @@ class EndChecker(Instruction):
1206
1236
  self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
1207
1237
  if self._end_phrase is None:
1208
1238
  self._end_phrase = random.choice(_ENDING_OPTIONS)
1209
- self._description_pattern = ('Finish your response with this exact phrase {ender}. '
1210
- 'No other words should follow this phrase.')
1239
+ self._description_pattern = (
1240
+ 'Finish your response with this exact phrase {ender}. '
1241
+ 'No other words should follow this phrase.'
1242
+ )
1211
1243
  return self._description_pattern.format(ender=self._end_phrase)
1212
1244
 
1213
1245
  def get_instruction_args(self):
@@ -1229,8 +1261,10 @@ class TitleChecker(Instruction):
1229
1261
 
1230
1262
  def build_description(self):
1231
1263
  """Build the instruction description."""
1232
- self._description_pattern = ('Your answer must contain a title, wrapped in double angular brackets,'
1233
- ' such as <<poem of joy>>.')
1264
+ self._description_pattern = (
1265
+ 'Your answer must contain a title, wrapped in double angular brackets,'
1266
+ ' such as <<poem of joy>>.'
1267
+ )
1234
1268
  return self._description_pattern
1235
1269
 
1236
1270
  def get_instruction_args(self):
@@ -1284,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
1284
1318
  if let_relation is None:
1285
1319
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
1286
1320
  elif let_relation not in _COMPARISON_RELATION:
1287
- raise ValueError('The supported relation for comparison must be in '
1288
- f'{_COMPARISON_RELATION}, but {let_relation} is given.')
1321
+ raise ValueError(
1322
+ 'The supported relation for comparison must be in '
1323
+ f'{_COMPARISON_RELATION}, but {let_relation} is given.'
1324
+ )
1289
1325
  else:
1290
1326
  self._comparison_relation = let_relation
1291
1327
 
1292
- self._description_pattern = ('In your response, the letter {letter} should appear {let_relation}'
1293
- ' {let_frequency} times.')
1328
+ self._description_pattern = (
1329
+ 'In your response, the letter {letter} should appear {let_relation}'
1330
+ ' {let_frequency} times.'
1331
+ )
1294
1332
 
1295
1333
  return self._description_pattern.format(
1296
1334
  letter=self._letter,
@@ -1339,7 +1377,7 @@ class CapitalLettersEnglishChecker(Instruction):
1339
1377
  def check_following(self, value):
1340
1378
  """Checks that the response is in English and in all capital letters."""
1341
1379
  assert isinstance(value, str)
1342
-
1380
+ import langdetect
1343
1381
  try:
1344
1382
  return value.isupper() and langdetect.detect(value) == 'en'
1345
1383
  except langdetect.LangDetectException as e:
@@ -1353,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):
1353
1391
 
1354
1392
  def build_description(self):
1355
1393
  """Build the instruction description."""
1356
- self._description_pattern = ('Your entire response should be in English, and in all lowercase'
1357
- ' letters. No capital letters are allowed.')
1394
+ self._description_pattern = (
1395
+ 'Your entire response should be in English, and in all lowercase'
1396
+ ' letters. No capital letters are allowed.'
1397
+ )
1358
1398
  return self._description_pattern
1359
1399
 
1360
1400
  def get_instruction_args(self):
@@ -1367,7 +1407,7 @@ class LowercaseLettersEnglishChecker(Instruction):
1367
1407
  def check_following(self, value):
1368
1408
  """Checks that the response is in English and in all lowercase letters."""
1369
1409
  assert isinstance(value, str)
1370
-
1410
+ import langdetect
1371
1411
  try:
1372
1412
  return value.islower() and langdetect.detect(value) == 'en'
1373
1413
  except langdetect.LangDetectException as e:
@@ -1423,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
1423
1463
  if capital_relation is None:
1424
1464
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
1425
1465
  elif capital_relation not in _COMPARISON_RELATION:
1426
- raise ValueError('The supported relation for comparison must be in '
1427
- f'{_COMPARISON_RELATION}, but {capital_relation} is given.')
1428
-
1429
- self._description_pattern = ('In your response, words with all capital letters should appear'
1430
- ' {relation} {frequency} times.')
1466
+ raise ValueError(
1467
+ 'The supported relation for comparison must be in '
1468
+ f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
1469
+ )
1470
+
1471
+ self._description_pattern = (
1472
+ 'In your response, words with all capital letters should appear'
1473
+ ' {relation} {frequency} times.'
1474
+ )
1431
1475
 
1432
1476
  return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
1433
1477
 
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
  """Registry of all instructions."""
15
15
 
16
- from evalscope.benchmarks.ifeval import instructions
16
+ from . import instructions
17
17
 
18
18
  _KEYWORD = 'keywords:'
19
19
 
@@ -14,7 +14,6 @@
14
14
  """Utility library of instructions."""
15
15
 
16
16
  import functools
17
- import immutabledict
18
17
  import nltk
19
18
  import os
20
19
  import random
@@ -1551,7 +1550,7 @@ WORD_LIST = [
1551
1550
  ] # pylint: disable=line-too-long
1552
1551
 
1553
1552
  # ISO 639-1 codes to language names.
1554
- LANGUAGE_CODES = immutabledict.immutabledict({
1553
+ LANGUAGE_CODES = {
1555
1554
  'en': 'English',
1556
1555
  'es': 'Spanish',
1557
1556
  'pt': 'Portuguese',
@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
1582
1581
  'pa': 'Punjabi',
1583
1582
  'ml': 'Malayalam',
1584
1583
  'fi': 'Finnish',
1585
- })
1584
+ }
1586
1585
 
1587
1586
  _ALPHABETS = '([A-Za-z])'
1588
1587
  _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
@@ -1,7 +1,7 @@
1
1
  import dataclasses
2
2
  from typing import Dict, Optional, Union
3
3
 
4
- from evalscope.benchmarks.ifeval import instructions_registry
4
+ from . import instructions_registry
5
5
 
6
6
 
7
7
  @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
121
121
  out_loose = test_instruction_following_loose(inp, response)
122
122
 
123
123
  return {
124
- 'prompt_level_strict_acc': out_strict.follow_all_instructions,
125
- 'inst_level_strict_acc': out_strict.follow_instruction_list,
126
- 'prompt_level_loose_acc': out_loose.follow_all_instructions,
127
- 'inst_level_loose_acc': out_loose.follow_instruction_list,
124
+ 'prompt_level_strict': float(out_strict.follow_all_instructions),
125
+ 'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
126
+ 'prompt_level_loose': float(out_loose.follow_all_instructions),
127
+ 'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
128
128
  }
129
129
 
130
130
 
131
131
  def agg_inst_level_acc(items):
132
- flat_items = [item for sublist in items for item in sublist]
133
- inst_level_acc = sum(flat_items) / len(flat_items)
132
+ inst_level_acc = sum(items) / len(items) if items else 0
134
133
  return inst_level_acc
File without changes
File without changes