evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ner/cross_ner_adapter.py
@@ -0,0 +1,120 @@
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+ DESCRIPTION = (
+     'CrossNER is a fully-labelled collection of named entity recognition (NER) data '
+     'spanning five diverse domains (AI, Literature, Music, Politics, Science).'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='cross_ner',
+         pretty_name='CrossNER',
+         dataset_id='extraordinarylab/cross-ner',
+         subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CrossNERAdapter(NERAdapter):
+     """
+     Adapter for the CrossNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the CrossNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Entity mappings are subset-specific; populated by setup_entity_mappings()
+         self.entity_type_map = {}
+
+         # Descriptions for each entity type, also populated per subset
+         self.entity_descriptions = {}
+
+     def setup_entity_mappings(self):
+         """
+         Set up entity mappings and descriptions for prompt formatting.
+         This should be called after entity_type_map and entity_descriptions are defined.
+         """
+         if self.current_subset_name == 'ai':
+             self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+         elif self.current_subset_name == 'literature':
+             self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+         elif self.current_subset_name == 'music':
+             self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+         elif self.current_subset_name == 'politics':
+             self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+         elif self.current_subset_name == 'science':
+             self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+         # Reverse mapping for converting predictions back to canonical labels
+         self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+         # Create list of tags for prompt formatting
+         self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+         # Create description of entities for the prompt
+         self.entities_description = ', '.join([
+             f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+         ])
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a record with tokens and NER tags into a Sample.
+         Creates both the raw text input and the annotated text target.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+
+         tokens: List[str] = record['tokens']
+         ner_tags: List[str] = record['ner_tags']
+
+         # Create the input text by joining tokens
+         input_text = ' '.join(tokens)
+
+         # Process tokens and tags to create the annotated target text
+         target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+         # Store tokens and tags in metadata for evaluation
+         metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+         return Sample(input=input_text, target=target_text, metadata=metadata)
+
+     def format_prompt_template(self, sample):
+         """
+         Format the prompt with entity types, available tags, and the text to annotate.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.prompt_template.format(
+             entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+         )
+
+     def format_fewshot_template(self, fewshot, sample):
+         """
+         Format the few-shot prompt with all required parameters.
+         """
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.few_shot_prompt_template.format(
+             fewshot=fewshot,
+             entities=self.entities_description,
+             entity_list=', '.join(self.entity_list),
+             text=sample.input
+         )
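
To make the prompt-construction path concrete, here is a minimal standalone sketch (not part of the diff) of what setup_entity_mappings computes, using a two-entry slice of the 'ai' subset mapping shown in the next hunk; the variable names mirror the adapter's attributes:

entity_type_map = {'ALGORITHM': 'algorithm', 'RESEARCHER': 'researcher'}  # trimmed to two entries
entity_descriptions = {
    'ALGORITHM': 'A specific algorithm or model architecture in AI.',
    'RESEARCHER': 'A person who conducts research in the field of AI.',
}

# Reverse map: lowercase tag name back to the canonical label, used when scoring predictions.
reverse_entity_map = {v.lower(): k for k, v in entity_type_map.items()}

# The tags the model may emit, interpolated into the prompt as {entity_list}.
entity_list = [f'<{ent.lower()}>' for ent in entity_type_map.values()]

# Human-readable legend, interpolated into the prompt as {entities}.
entities_description = ', '.join(
    f'{entity_type_map[tag]} ({entity_descriptions[tag]})' for tag in entity_type_map
)

print(entity_list)         # ['<algorithm>', '<researcher>']
print(reverse_entity_map)  # {'algorithm': 'ALGORITHM', 'researcher': 'RESEARCHER'}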
evalscope/benchmarks/ner/cross_ner_entities/ai.py
@@ -0,0 +1,54 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ALGORITHM': 'algorithm',
+         'CONFERENCE': 'conference',
+         'COUNTRY': 'country',
+         'FIELD': 'field',
+         'LOCATION': 'location',
+         'METRICS': 'metrics',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'PRODUCT': 'product',
+         'PROGRAMLANG': 'programming_language',
+         'RESEARCHER': 'researcher',
+         'TASK': 'task',
+         'UNIVERSITY': 'university'
+     }
+     entity_descriptions = {
+         'ALGORITHM':
+         ('A specific algorithm or model architecture in AI (e.g., "Transformer", '
+          '"gradient descent", "ResNet").'),
+         'CONFERENCE': ('An academic conference related to AI (e.g., "NeurIPS", "ICML", "CVPR").'),
+         'COUNTRY': ('A country mentioned in the context of AI research or development '
+                     '(e.g., "USA", "China").'),
+         'FIELD':
+         ('A sub-field or area of study within AI (e.g., "Natural Language Processing", '
+          '"Computer Vision").'),
+         'LOCATION':
+         ('A specific geographical location relevant to AI, other than countries '
+          '(e.g., "Silicon Valley").'),
+         'METRICS': ('A performance metric used to evaluate AI models (e.g., "F1-score", '
+                     '"BLEU", "accuracy").'),
+         'MISC': ('Miscellaneous AI-related terms that don\'t fit other categories '
+                  '(e.g., "Turing Award").'),
+         'ORGANISATION':
+         ('An organization, company, or lab involved in AI (e.g., "Google AI", '
+          '"OpenAI", "DeepMind").'),
+         'PERSON':
+         ('A person mentioned in the context of AI, who is not a researcher '
+          '(e.g., a CEO or public figure).'),
+         'PRODUCT': ('An AI-related product, framework, or software (e.g., "TensorFlow", '
+                     '"PyTorch", "AlphaGo").'),
+         'PROGRAMLANG': ('A programming language used in AI (e.g., "Python", "C++", "Julia").'),
+         'RESEARCHER': ('A person who conducts research in the field of AI (e.g., "Yann LeCun", '
+                        '"Geoffrey Hinton").'),
+         'TASK': (
+             'A specific problem or task that AI is used to solve (e.g., "Image Classification", '
+             '"Sentiment Analysis").'
+         ),
+         'UNIVERSITY':
+         ('A university or academic institution involved in AI research (e.g., '
+          '"Stanford University", "MIT").')
+     }
+     return entity_type_map, entity_descriptions
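
The annotated target that record_to_sample builds comes from create_target_text in evalscope/utils/ner.py (+377 lines, not shown in this diff). Below is a hypothetical re-implementation sketching what such a helper plausibly does — converting BIO tags into the lowercase XML-style span markers that entity_list advertises; the real implementation may differ in details:

from typing import Dict, List

def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
    out, open_tag = [], None
    for token, tag in zip(tokens, ner_tags):
        label = tag.split('-', 1)[1] if '-' in tag else None  # 'B-RESEARCHER' -> 'RESEARCHER'
        mapped = entity_type_map.get(label) if label else None
        if mapped != open_tag or tag.startswith('B-'):
            if open_tag:
                out.append(f'</{open_tag}>')  # close the previous span
            if mapped:
                out.append(f'<{mapped}>')     # open a new span
            open_tag = mapped
        out.append(token)
    if open_tag:
        out.append(f'</{open_tag}>')
    return ' '.join(out)

tokens = ['Geoffrey', 'Hinton', 'proposed', 'backpropagation']
tags = ['B-RESEARCHER', 'I-RESEARCHER', 'O', 'B-ALGORITHM']
print(create_target_text(tokens, tags, {'RESEARCHER': 'researcher', 'ALGORITHM': 'algorithm'}))
# <researcher> Geoffrey Hinton </researcher> proposed <algorithm> backpropagation </algorithm>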
evalscope/benchmarks/ner/cross_ner_entities/literature.py
@@ -0,0 +1,36 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'AWARD': 'award',
+         'BOOK': 'book',
+         'COUNTRY': 'country',
+         'EVENT': 'event',
+         'LITERARYGENRE': 'literary_genre',
+         'LOCATION': 'location',
+         'MAGAZINE': 'magazine',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'POEM': 'poem',
+         'WRITER': 'writer'
+     }
+     entity_descriptions = {
+         'AWARD': ('A literary award or prize (e.g., "Nobel Prize in Literature", "Booker Prize").'),
+         'BOOK': ('The title of a book (e.g., "Pride and Prejudice", "One Hundred Years of Solitude").'),
+         'COUNTRY': ('A country relevant to the literary context (e.g., "England", "Russia").'),
+         'EVENT': ('A literary festival or significant event (e.g., "Hay Festival", "Frankfurt Book Fair").'),
+         'LITERARYGENRE':
+         ('A genre or category of literature (e.g., "Science Fiction", "Gothic novel", '
+          '"magical realism").'),
+         'LOCATION': ('A real or fictional place mentioned in a literary context (e.g., "London", '
+                      '"Middle-earth").'),
+         'MAGAZINE': ('A magazine or literary journal (e.g., "The New Yorker", "Paris Review").'),
+         'MISC': ('Miscellaneous literary terms (e.g., "protagonist", "sonnet", '
+                  '"Shakespeare\'s Globe").'),
+         'ORGANISATION': ('A publishing house or literary organization (e.g., "Penguin Random House").'),
+         'PERSON': ('A character or person mentioned who is not a writer (e.g., "Elizabeth Bennet", '
+                    '"King Lear").'),
+         'POEM': ('The title of a poem (e.g., "The Waste Land", "Ozymandias").'),
+         'WRITER': ('The name of a writer, author, or poet (e.g., "Jane Austen", '
+                    '"Gabriel Garcia Marquez").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/music.py
@@ -0,0 +1,39 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ALBUM': 'album',
+         'AWARD': 'award',
+         'BAND': 'band',
+         'COUNTRY': 'country',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'MUSICALARTIST': 'musical_artist',
+         'MUSICALINSTRUMENT': 'musical_instrument',
+         'MUSICGENRE': 'music_genre',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'SONG': 'song'
+     }
+     entity_descriptions = {
+         'ALBUM': ('The title of a music album (e.g., "Abbey Road", "Thriller", "Lemonade").'),
+         'AWARD': ('A music award or prize (e.g., "Grammy Award", "MTV Music Award").'),
+         'BAND': ('The name of a musical group or band (e.g., "The Beatles", "Queen", "BTS").'),
+         'COUNTRY': ('A country relevant to the music context (e.g., "USA", "UK", "South Korea").'),
+         'EVENT': ('A music festival, concert tour, or event (e.g., "Glastonbury Festival", '
+                   '"Woodstock").'),
+         'LOCATION':
+         ('A venue, studio, or place relevant to music (e.g., "Madison Square Garden", '
+          '"Abbey Road Studios").'),
+         'MISC': ('Miscellaneous music-related terms (e.g., "synthesizer", "major key", '
+                  '"a cappella").'),
+         'MUSICALARTIST': ('A solo musician or singer (e.g., "Michael Jackson", "Taylor Swift", '
+                           '"Ed Sheeran").'),
+         'MUSICALINSTRUMENT': ('A musical instrument (e.g., "guitar", "piano", "violin").'),
+         'MUSICGENRE': ('A genre or style of music (e.g., "Rock", "Pop", "Jazz", "K-Pop").'),
+         'ORGANISATION': ('A record label or music organization (e.g., "Capitol Records", "Sony Music").'),
+         'PERSON':
+         ('A person related to music who is not a primary artist (e.g., a producer, '
+          'a songwriter, "John Lennon").'),
+         'SONG': ('The title of a song (e.g., "Bohemian Rhapsody", "Hey Jude", "Dynamite").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/politics.py
@@ -0,0 +1,37 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'COUNTRY': 'country',
+         'ELECTION': 'election',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'POLITICALPARTY': 'political_party',
+         'POLITICIAN': 'politician'
+     }
+     entity_descriptions = {
+         'COUNTRY': ('A country or sovereign state (e.g., "United States", "Germany").'),
+         'ELECTION': ('A specific election event (e.g., "2024 presidential election", '
+                      '"midterm elections").'),
+         'EVENT':
+         ('A significant political event, summit, or incident (e.g., "G7 Summit", '
+          '"Brexit", "Watergate scandal").'),
+         'LOCATION':
+         ('A politically significant building or location (e.g., "The White House", '
+          '"10 Downing Street").'),
+         'MISC': (
+             'Miscellaneous political terms, ideologies, or documents (e.g., "democracy", '
+             '"impeachment", "the Constitution").'
+         ),
+         'ORGANISATION':
+         ('A political or governmental organization (e.g., "United Nations", "NATO", '
+          '"European Union").'),
+         'PERSON':
+         ('A person mentioned in a political context who is not a politician '
+          '(e.g., a journalist, an activist).'),
+         'POLITICALPARTY': ('A named political party (e.g., "Democratic Party", "Conservative Party").'),
+         'POLITICIAN': ('A person who holds or seeks political office (e.g., "Joe Biden", '
+                        '"Angela Merkel").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/science.py
@@ -0,0 +1,58 @@
+ def get_entity_mappings():
+     entity_type_map = {
+         'ACADEMICJOURNAL': 'academic_journal',
+         'ASTRONOMICALOBJECT': 'astronomical_object',
+         'AWARD': 'award',
+         'CHEMICALCOMPOUND': 'chemical_compound',
+         'CHEMICALELEMENT': 'chemical_element',
+         'COUNTRY': 'country',
+         'DISCIPLINE': 'discipline',
+         'ENZYME': 'enzyme',
+         'EVENT': 'event',
+         'LOCATION': 'location',
+         'MISC': 'misc',
+         'ORGANISATION': 'organisation',
+         'PERSON': 'person',
+         'PROTEIN': 'protein',
+         'SCIENTIST': 'scientist',
+         'THEORY': 'theory',
+         'UNIVERSITY': 'university'
+     }
+     entity_descriptions = {
+         'ACADEMICJOURNAL': ('A scientific journal or publication (e.g., "Nature", "Science", "The Lancet").'),
+         'ASTRONOMICALOBJECT': ('A natural object in space (e.g., "Mars", "Andromeda Galaxy", '
+                                '"Halley\'s Comet").'),
+         'AWARD': ('A scientific award or prize (e.g., "Nobel Prize in Physics", "Fields Medal").'),
+         'CHEMICALCOMPOUND':
+         ('A chemical substance consisting of two or more elements (e.g., "H2O", '
+          '"Carbon Dioxide").'),
+         'CHEMICALELEMENT': ('An element from the periodic table (e.g., "Hydrogen", "Oxygen", "Gold").'),
+         'COUNTRY': ('A country relevant to a scientific context (e.g., "Switzerland" for CERN).'),
+         'DISCIPLINE':
+         ('A branch of science or academic discipline (e.g., "Physics", '
+          '"Molecular Biology", "Astronomy").'),
+         'ENZYME': ('A specific type of protein that acts as a catalyst (e.g., "Lactase", "Catalase").'),
+         'EVENT': ('A significant scientific mission or event (e.g., "Apollo 11 mission", '
+                   '"Human Genome Project").'),
+         'LOCATION':
+         ('A research facility or location of scientific importance (e.g., "CERN", '
+          '"International Space Station").'),
+         'MISC':
+         ('Miscellaneous scientific terms or concepts (e.g., "double helix", '
+          '"black hole", "quantum mechanics").'),
+         'ORGANISATION': ('A scientific organization or agency (e.g., "NASA", "Max Planck Society", "WHO").'),
+         'PERSON':
+         ('A person mentioned in a scientific context who is not a scientist '
+          '(e.g., a patient, a benefactor).'),
+         'PROTEIN': ('A specific protein (that is not an enzyme) (e.g., "Hemoglobin", '
+                     '"Insulin", "Keratin").'),
+         'SCIENTIST':
+         ('A person who is a scientist, researcher, or inventor (e.g., "Albert Einstein", '
+          '"Marie Curie").'),
+         'THEORY': ('A named scientific theory or law (e.g., "Theory of Relativity", '
+                    '"Big Bang Theory").'),
+         'UNIVERSITY':
+         ('A university or academic institution involved in science (e.g., '
+          '"Cambridge University", "Caltech").')
+     }
+     return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/genia_ner_adapter.py
@@ -0,0 +1,66 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'GeniaNER consists of 2,000 MEDLINE abstracts with more than 400,000 words '
+     'and almost 100,000 annotations for biological terms.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='genia_ner',
+         pretty_name='GeniaNER',
+         dataset_id='extraordinarylab/genia-ner',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class GeniaNERAdapter(NERAdapter):
+     """
+     Adapter for the GeniaNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the GeniaNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define GeniaNER-specific entity mappings
+         self.entity_type_map = {
+             'CELL_LINE': 'cell_line',
+             'CELL_TYPE': 'cell_type',
+             'DNA': 'dna',
+             'PROTEIN': 'protein',
+             'RNA': 'rna'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'CELL_LINE':
+             'A population of cells derived from a single cell and grown in a culture.',
+             'CELL_TYPE':
+             ('A category of cells that are part of a larger organism and share a specific '
+              'structure and function.'),
+             'DNA':
+             'Deoxyribonucleic acid. This includes specific genes, domains, and regions of a DNA molecule.',
+             'PROTEIN': (
+                 'Molecules composed of amino acids that perform a vast array of functions within '
+                 'organisms. This includes enzymes, receptors, and signaling molecules.'
+             ),
+             'RNA':
+             'Ribonucleic acid. This refers to RNA molecules, including messenger RNA (mRNA) and other types.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/harvey_ner_adapter.py
@@ -0,0 +1,58 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'HarveyNER is a dataset with fine-grained locations annotated in tweets. This dataset '
+     'presents unique challenges and contains many complex and long location mentions '
+     'in informal descriptions.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='harvey_ner',
+         pretty_name='HarveyNER',
+         dataset_id='extraordinarylab/harvey-ner',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class HarveyNERAdapter(NERAdapter):
+     """
+     Adapter for the HarveyNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the HarveyNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define HarveyNER-specific entity mappings
+         self.entity_type_map = {'AREA': 'area', 'POINT': 'point', 'RIVER': 'river', 'ROAD': 'road'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'AREA':
+             'Geographical entities such as city subdivisions, neighborhoods, etc.',
+             'POINT': (
+                 'An exact location that a geocoordinate can be assigned. E.g., a uniquely named '
+                 'building, intersections of roads or rivers.'
+             ),
+             'RIVER':
+             'A river or a section of a river.',
+             'ROAD':
+             'A road or a section of a road.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py
@@ -0,0 +1,74 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Movie-Trivia dataset, originally created for slot filling, is modified by '
+     'ignoring some slot types (e.g. genre, rating) and merging others (e.g. director '
+     'and actor into person, and song and movie title into title) in order to keep '
+     'consistent named entity types across all datasets.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit_movie_trivia',
+         pretty_name='MIT-Movie-Trivia',
+         dataset_id='extraordinarylab/mit-movie-trivia',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITMovieTriviaAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Movie-Trivia Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Movie-Trivia dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Movie-Trivia-specific entity mappings
+         self.entity_type_map = {
+             'ACTOR': 'actor',
+             'AWARD': 'award',
+             'CHARACTER_NAME': 'character_name',
+             'DIRECTOR': 'director',
+             'GENRE': 'genre',
+             'OPINION': 'opinion',
+             'ORIGIN': 'origin',
+             'PLOT': 'plot',
+             'QUOTE': 'quote',
+             'RELATIONSHIP': 'relationship',
+             'SOUNDTRACK': 'soundtrack',
+             'YEAR': 'year'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'ACTOR': 'The name of an actor or actress starring in the movie.',
+             'AWARD': 'An award the movie won or was nominated for.',
+             'CHARACTER_NAME': 'The name of a character in the movie.',
+             'DIRECTOR': 'The name of the person who directed the movie.',
+             'GENRE': 'The category or style of the movie.',
+             'OPINION': 'A subjective review or personal opinion about the movie.',
+             'ORIGIN': 'The source material or basis for the movie.',
+             'PLOT': 'A description or summary of the movie\'s storyline.',
+             'QUOTE': 'A memorable line or phrase spoken in the movie.',
+             'RELATIONSHIP': 'The connection or relationship between characters.',
+             'SOUNDTRACK': 'The music or a specific song from the movie.',
+             'YEAR': 'The release year of the movie.'
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_restaurant_adapter.py
@@ -0,0 +1,66 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Restaurant dataset is a collection of restaurant review text specifically '
+     'curated for training and testing Natural Language Processing (NLP) models, '
+     'particularly for Named Entity Recognition (NER). It contains sentences from real '
+     'reviews, along with corresponding labels in the BIO format.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit_restaurant',
+         pretty_name='MIT-Restaurant',
+         dataset_id='extraordinarylab/mit-restaurant',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITRestaurantAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Restaurant Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Restaurant dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Restaurant-specific entity mappings
+         self.entity_type_map = {
+             'AMENITY': 'amenity',
+             'CUISINE': 'cuisine',
+             'DISH': 'dish',
+             'HOURS': 'hours',
+             'LOCATION': 'location',
+             'PRICE': 'price',
+             'RATING': 'rating',
+             'RESTAURANT_NAME': 'restaurant_name'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'AMENITY': 'A feature or service offered by the restaurant.',
+             'CUISINE': 'The type of food a restaurant serves.',
+             'DISH': 'A specific food or drink item.',
+             'HOURS': 'The operating hours of a restaurant.',
+             'LOCATION': 'The address or general location of a restaurant.',
+             'PRICE': 'The price range of a restaurant.',
+             'RATING': 'A rating or review of the restaurant.',
+             'RESTAURANT_NAME': 'The name of a restaurant.',
+         }
+
+         # Set up entity mappings based on the defined entity types
+         self.setup_entity_mappings()
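
Finally, a hedged end-to-end sketch of how one of the benchmarks registered above would be run. It follows evalscope's documented TaskConfig/run_task entry points; the model id, the limit, and the dataset_args narrowing are illustrative placeholders, so consult the 1.2.0 docs for the exact fields:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder; any supported model id
    datasets=['cross_ner'],              # the name passed to register_benchmark above
    dataset_args={'cross_ner': {'subset_list': ['ai', 'music']}},  # optional subset narrowing
    limit=10,                            # smoke-test on the first 10 samples per subset
)
run_task(task_cfg=task_cfg)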