evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -0,0 +1,389 @@
+ import os
+ from itertools import product
+ from tqdm import tqdm
+ from typing import TYPE_CHECKING, Any, Dict, List, Union
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.report import Report
+
+ logger = get_logger()
+
+ PROMPT_TEMPLATE = """Please read the following text and answer the question below.
+
+ <text>
+ {context}
+ </text>
+
+ <question>
+ {question}
+ </question>
+
+ Don't give information outside the document or repeat your findings."""
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='needle_haystack',
+         pretty_name='Needle-in-a-Haystack',
+         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+         'It requires the model to find specific information within a large corpus of text. '
+         '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)',  # noqa: E501
+         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+         metric_list=['acc'],
+         subset_list=['english', 'chinese'],
+         eval_split='test',
+         system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+         prompt_template=PROMPT_TEMPLATE,
+         extra_params={
+             'retrieval_question':
+             'What is the best thing to do in San Francisco?',
+             'needles':
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+             'context_lengths_min':
+             1000,
+             'context_lengths_max':
+             32000,
+             'context_lengths_num_intervals':
+             10,
+             'document_depth_percent_min':
+             0,
+             'document_depth_percent_max':
+             100,
+             'document_depth_percent_intervals':
+             10,
+             'tokenizer_path':
+             'Qwen/Qwen3-0.6B',
+             'show_score':
+             False,
+         }
+     )
+ )
+ class NeedleHaystackAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self._use_llm_judge = True
+         self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
+         # set extra params
+         self.retrieval_question = self.extra_params.get(
+             'retrieval_question', 'What is the best thing to do in San Francisco?'
+         )
+         self.needles = self.extra_params.get(
+             'needles',
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+         )
+         self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+         self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+         self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+         self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+         self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+         self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+         self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+         self.show_score = self.extra_params.get('show_score', False)
+
+         self._init_tokenizer()
+         self._init_length()
+
+     def _init_length(self):
+         """ Initialize context lengths and document depth percentages based on the provided parameters."""
+         import numpy as np
+
+         self.context_lengths = np.round(
+             np.linspace(
+                 self.context_lengths_min,
+                 self.context_lengths_max,
+                 num=self.context_lengths_num_intervals,
+                 endpoint=True
+             )
+         ).astype(int)
+
+         self.document_depth_percents = np.round(
+             np.linspace(
+                 self.document_depth_percent_min,
+                 self.document_depth_percent_max,
+                 num=self.document_depth_percent_intervals,
+                 endpoint=True
+             )
+         ).astype(int)
+
+     def _init_tokenizer(self):
+         """ Initialize the tokenizer based on the provided tokenizer path."""
+         from modelscope import AutoTokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
+
+     def load(self):
+         """Load dataset from local disk or remote."""
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             dataset_path = dataset_snapshot_download(
+                 dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+             )
+
+         # Load datasets for both subsets
+         datasets = {}
+         file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+         for subset_name, files in file_structure.items():
+             if subset_name not in self.subset_list:
+                 continue
+             file_path = os.path.join(dataset_path, files[0])
+             if os.path.exists(file_path):
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     text = f.read()
+
+                 # Generate samples for all combinations of context length and depth
+                 records = []
+                 tokens_context = self._get_context_tokens(text)
+                 for context_length, depth_percent in tqdm(
+                     product(self.context_lengths, self.document_depth_percents),
+                     desc=f'Generating {subset_name} samples'
+                 ):
+                     context = self._insert_needles(tokens_context, depth_percent, context_length)
+                     record = {
+                         'text': text,
+                         'context_length': int(context_length),
+                         'depth_percent': int(depth_percent),
+                         'question': self.retrieval_question,
+                         'answer': '\n'.join(self.needles),
+                         'context': context,
+                     }
+                     records.append(record)
+
+                 dataset = DictDataLoader(
+                     dict_list=records,
+                     limit=self.limit,
+                     repeats=self.repeats,
+                     sample_fields=self.record_to_sample,
+                     shuffle=self.shuffle,
+                 ).load()
+
+                 datasets[subset_name] = dataset
+
+         test_dataset = DatasetDict(datasets)
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         return Sample(
+             input=record['question'],
+             target=record['answer'],
+             metadata={
+                 'context': record['context'],
+                 'context_length': record['context_length'],
+                 'depth_percent': record['depth_percent'],
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         """Format the prompt template with context and question."""
+         context = sample.metadata['context']
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)
+
+     def _get_context_tokens(self, input_context: str) -> list:
+         """
+         Encodes the context string into tokens using the tokenizer, ensuring the tokenized context
+         is at least as long as the maximum context length required.
+
+         Args:
+             input_context (str): The context string to be tokenized.
+
+         Returns:
+             List[int]: A list of token IDs representing the context.
+         """
+         max_context_length = max(self.context_lengths)
+         context = input_context
+         tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+         # Repeat the context until reaching the required length
+         while len(tokens_context) < max_context_length:
+             context += '\n' + input_context
+             tokens_context = self.tokenizer.encode(context, add_special_tokens=False)
+         return tokens_context
+
+     def _insert_needles(self, tokens_context, depth_percent, context_length):
+         """
+         Inserts multiple needles (specific facts or pieces of information) into the original context string at
+         designated depth percentages, effectively distributing these needles throughout the context. This method
+         is designed to test a model's ability to retrieve specific information (needles) from a larger body of text
+         (haystack) based on the placement depth of these needles.
+
+         The method first encodes the context and each needle into tokens to calculate their lengths in tokens.
+         It then adjusts the context length to accommodate the final buffer length. This is crucial for ensuring
+         that the total token count (context plus needles) does not exceed the maximum allowable context length,
+         which might otherwise lead to information being truncated.
+
+         This approach calculates the initial insertion point for the first needle as before but then calculates even
+         spacing for the remaining needles based on the remaining context length. It ensures that needles are
+         distributed as evenly as possible throughout the context after the first insertion.
+
+         Args:
+             tokens_context (List[int]): The original context tokens.
+             depth_percent (float): The depth percent at which to insert the needles.
+             context_length (int): The total length of the context in tokens, adjusted for final buffer.
+
+         Returns:
+             str: The new context with needles inserted.
+         """
+
+         context_length -= 150
+
+         # Calculate the total length of all needles in tokens
+         total_needles_length = sum(len(self.tokenizer.encode(needle)) for needle in self.needles)
+
+         # Ensure context length accounts for needles
+         if len(tokens_context) + total_needles_length > context_length:
+             tokens_context = tokens_context[:context_length - total_needles_length]
+
+         # To evenly distribute the needles, we calculate the intervals they need to be inserted.
+         depth_percent_interval = (100 - depth_percent) / len(self.needles)
+
+         # Reset the insertion percentages list for the current context
+         self.insertion_percentages = []
+
+         # Insert needles at calculated points
+         for needle in self.needles:
+
+             tokens_needle = self.tokenizer.encode(needle)
+
+             if depth_percent == 100:
+                 # If your depth percent is 100 (which means your needle is the last thing in the doc),
+                 # throw it at the end
+                 tokens_context = tokens_context + tokens_needle
+             else:
+                 # Go get the position (in terms of tokens) to insert your needle
+                 insertion_point = int(len(tokens_context) * (depth_percent / 100))
+
+                 # tokens_new_context represents the tokens before the needle
+                 tokens_new_context = tokens_context[:insertion_point]
+
+                 # We want to make sure that we place our needle at a sentence break
+                 # so we first see what token a '.' is
+                 period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
+                     '。'
+                 )  # Handle both English and Chinese periods
+
+                 # Then we iterate backwards until we find the first period
+                 while tokens_new_context and tokens_new_context[-1] not in period_tokens:
+                     insertion_point -= 1
+                     tokens_new_context = tokens_context[:insertion_point]
+
+                 # Insert the needle into the context at the found position
+                 tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
+
+                 # Log
+                 insertion_percentage = (insertion_point / len(tokens_context)) * 100
+                 self.insertion_percentages.append(insertion_percentage)
+                 logger.debug(
+                     f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                     f'total length now: {len(tokens_context)} tokens'
+                 )
+
+                 # Adjust depth for next needle
+                 depth_percent += depth_percent_interval
+
+         new_context = self.tokenizer.decode(tokens_context)
+         return new_context
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Calculate evaluation scores by comparing prediction with reference."""
+         from evalscope.metrics import exact_match
+         from .utils import normalize_answer
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)
+
+         norm_gold = normalize_answer(reference)
+         norm_pred = normalize_answer(filtered_prediction)
+         accuracy = exact_match(gold=norm_gold, pred=norm_pred)
+
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.main_score_name = metric_name
+
+         return score
+
+     def llm_match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
+         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)
+         question = task_state.input_text
+
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+         # Parse grading score with regex, [[score]]
+         accuracy = parse_score(orm_response) if orm_response else 0.0
+
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+             'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+         }
+         score.main_score_name = metric_name
+
+         return score
+
+     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
+         try:
+             import os
+
+             from .utils import draw_score_chat
+
+             report_path = output_dir
+             data_frame = report.to_dataframe()
+             # split `Metric` to `Context` and `Depth`
+             data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
+             data_frame['Depth'] = data_frame['Depth'].str.replace('Depth#', '').astype(float)
+             data_frame['Context'] = data_frame['Context'].str.replace('Context#', '').astype(int)
+             # split by `Subset` to multi sub data frame
+             for subset in data_frame['Subset'].unique():
+                 sub_df = data_frame[data_frame['Subset'] == subset]
+                 # draw charts for each subset
+                 pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
+                                                  aggfunc='mean').reset_index()
+                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
+                 draw_score_chat(
+                     pivot_table,
+                     outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                     show_score=self.show_score
+                 )
+
+         except Exception as e:
+             logger.error(f'Error generating charts: {e}')
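Note: with the default extra_params above, the adapter builds a 10 x 10 grid of context lengths and insertion depths, so each subset ('english', 'chinese') expands to 100 retrieval samples, and every grid point gets its own metric name ('Context#<length> Depth#<depth>') that _on_generate_report_end later pivots into a heatmap. A standalone sketch of that expansion, using only the default values shown above and the same numpy/itertools calls as _init_length and load (no evalscope imports assumed):

# Sketch: reproduce the sample grid implied by the default extra_params above.
from itertools import product

import numpy as np

context_lengths = np.round(np.linspace(1000, 32000, num=10, endpoint=True)).astype(int)
depth_percents = np.round(np.linspace(0, 100, num=10, endpoint=True)).astype(int)

grid = list(product(context_lengths, depth_percents))
print(len(grid))           # 100 samples per subset
print(grid[0], grid[-1])   # (1000, 0) ... (32000, 100)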
evalscope/benchmarks/needle_haystack/utils.py
@@ -0,0 +1,79 @@
+ import matplotlib.pyplot as plt
+ import os
+ import re
+ import seaborn as sns
+ import string
+ from matplotlib.colors import LinearSegmentedColormap
+
+
+ def normalize_answer(s):
+
+     def remove_articles(text):
+         return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+     def white_space_fix(text):
+         return ' '.join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return ''.join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def parse_score(score_str: str) -> int:
+     """
+     Parses a score string and returns an integer score.
+     The score should be in the format [[score]].
+     """
+     score_match = re.search(r'\[\[(\d+)\]\]', score_str)
+     if score_match:
+         score = int(score_match.group(1))
+         return score / 10.0
+     else:
+         return 0.0
+
+
+ def draw_score_chat(pivot_table, outpath, show_score=False):
+     # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
+     cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
+
+     # Create the heatmap with better aesthetics
+     plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
+     sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
+
+     # More aesthetics
+     plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
+     plt.xlabel('Token Limit')  # X-axis label
+     plt.ylabel('Depth Percent')  # Y-axis label
+     plt.xticks(rotation=45)  # Rotates the x-axis labels to prevent overlap
+     plt.yticks(rotation=0)  # Ensures the y-axis labels are horizontal
+     plt.tight_layout()  # Fits everything neatly into the figure area
+
+     # save the figure
+     plt.savefig(outpath, dpi=300, bbox_inches='tight')
+
+
+ GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
+ Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
+ You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
+ You must provide your final score in the form of a number from 1 to 10, where:
+
+ Score 1: The answer is completely unrelated to the reference.
+ Score 3: The answer has minor relevance but does not align with the reference.
+ Score 5: The answer has moderate relevance but contains inaccuracies.
+ Score 7: The answer aligns with the reference but has minor omissions.
+ Score 10: The answer is completely accurate and aligns perfectly with the reference.
+
+ Only respond with a numberical score with formatted as [[score]]."""  # noqa: E501
+
+ ORM_USER_TEMPLATE = """
+ Question: {question}
+
+ Reference Answer: {gold}
+
+ Model Answer: {pred}
+ """
File without changes
@@ -0,0 +1,52 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+     'and social uses. The goal is to represent a broad range of activities, giving a '
+     'dataset more representative of the language used in this hardest of social media '
+     'formats to process.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='broad_twitter_corpus',
+         pretty_name='BroadTwitterCorpus',
+         dataset_id='extraordinarylab/broad-twitter-corpus',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class BroadTwitterCorpusAdapter(NERAdapter):
+     """
+     Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the BroadTwitterCorpus dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define BroadTwitterCorpus-specific entity mappings
+         self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'PER': 'Names of people, including first and last names',
+             'ORG': 'Names of companies, institutions, organizations, etc.',
+             'LOC': 'Names of locations, cities, states, countries, etc.',
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
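The adapters registered here are selected by their `name` at evaluation time. A rough usage sketch follows, assuming evalscope's top-level `TaskConfig`/`run_task` entry points (not shown in this diff) and a placeholder model; the CoNLL2003 and Copious adapters below are invoked the same way via their own registered names.

from evalscope import TaskConfig, run_task

# Hypothetical configuration; the model name and limit are placeholders.
task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    datasets=['broad_twitter_corpus'],  # name registered by the adapter above
    limit=50,                           # score only a small sample while iterating
)
run_task(task_cfg=task_cfg)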
@@ -0,0 +1,48 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='conll2003',
+         pretty_name='CoNLL2003',
+         dataset_id='evalscope/conll2003',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description='The CoNLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+         'of the CoNLL-2003 shared task and contains texts annotated with entities such as '
+         'people, organizations, locations, and miscellaneous names.',
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CoNLL2003Adapter(NERAdapter):
+     """
+     Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the CoNLL2003 dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define CoNLL2003-specific entity mappings
+         self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'PER': 'Names of people, including first and last names',
+             'ORG': 'Names of companies, institutions, organizations, etc.',
+             'LOC': 'Names of locations, cities, states, countries, etc.',
+             'MISC': 'Miscellaneous entities not in the above categories'
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
@@ -0,0 +1,85 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The Copious corpus is a gold-standard corpus that covers a wide range of biodiversity '
+     'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+     'Library with over 26K sentences and more than 28K entities.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='copious',
+         pretty_name='Copious',
+         dataset_id='extraordinarylab/copious',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CopiousAdapter(NERAdapter):
+     """
+     Adapter for the Copious Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the Copious dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define Copious-specific entity mappings
+         self.entity_type_map = {
+             'TAXON': 'taxon',
+             'GEOGRAPHICAL_LOCATION': 'geographical_location',
+             'HABITAT': 'habitat',
+             'PERSON': 'person',
+             'TEMPORAL_EXPRESSION': 'temporal_expression'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'TAXON': (
+                 'Mentions of taxonomic ranks such as species, genus, and family. '
+                 'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                 'vernacular names (e.g., "flying fox"), but excludes general terms '
+                 'like "fish" or "birds" and microorganism names.'
+             ),
+             'GEOGRAPHICAL_LOCATION': (
+                 'Identifiable points or areas on the planet, including continents, '
+                 'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                 'of Mindoro", "Balayan Bay"). This also includes geographical '
+                 'coordinates (e.g., "13o 36\' 11\\" N.").'
+             ),
+             'HABITAT': (
+                 'Descriptions of environments where organisms live. This includes '
+                 'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                 'pastures") and places where parasites or epiphytes reside (e.g., '
+                 '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                 'like altitude or depth.'
+             ),
+             'PERSON': (
+                 'Proper nouns referring to person names, including those in historical '
+                 'accounts or citations related to a species observation (e.g., "In 1905, '
+                 '[Tattersall] follows..."). It excludes titles, general references like '
+                 '"the researcher", and names that are part of a taxon\'s authority.'
+             ),
+             'TEMPORAL_EXPRESSION': (
+                 'Spans of text referring to points in time. This includes specific dates '
+                 '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                 '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                 'within a taxon name\'s authority.'
+             )
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()