evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,120 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from evalscope.utils import get_logger
+ from . import ifeval
+
+ logger = get_logger()
+
+
+ def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+     # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+     response = str(x['response'])
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+         if response and instruction.check_following(response):
+             is_following_list.append(True)
+         else:
+             is_following_list.append(False)
+
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+     response = str(x['response'])
+     r = response.split('\n')
+     response_remove_first = '\n'.join(r[1:]).strip()
+     response_remove_last = '\n'.join(r[:-1]).strip()
+     response_remove_both = '\n'.join(r[1:-1]).strip()
+     revised_response = response.replace('*', '')
+     revised_response_remove_first = response_remove_first.replace('*', '')
+     revised_response_remove_last = response_remove_last.replace('*', '')
+     revised_response_remove_both = response_remove_both.replace('*', '')
+     all_responses = [
+         response,
+         revised_response,
+         response_remove_first,
+         response_remove_last,
+         response_remove_both,
+         revised_response_remove_first,
+         revised_response_remove_last,
+         revised_response_remove_both,
+     ]
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+
+         is_following = False
+         for r in all_responses: # type: ignore
+             if r.strip() and instruction.check_following(r): # type: ignore
+                 is_following = True
+                 break
+
+         is_following_list.append(is_following)
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+     prompt_total = 0
+     prompt_correct = 0
+     instruction_total = 0
+     instruction_correct = 0
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+
+         prompt_total += 1
+         if all(follow_instruction_list):
+             prompt_correct += 1
+
+         instruction_total += len(instruction_id_list)
+         instruction_correct += sum(follow_instruction_list)
+
+     return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+         instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+ def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+     prompt_res = []
+     inst_res = []
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+         if all(follow_instruction_list):
+             prompt_res.append(1)
+         else:
+             prompt_res.append(0)
+         inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+     return prompt_res, inst_res
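
The aggregation in parse_result and parse_result_no_reduce can be sanity-checked by hand. A minimal sketch (the two dicts are hypothetical outputs shaped like the return value of gen_acc_strict / gen_acc_loose, and the instruction ids are placeholders):

    from evalscope.benchmarks.multi_if.metrics import parse_result, parse_result_no_reduce

    outputs = [
        {'follow_instruction_list': [True, True], 'instruction_id_list': ['inst_a', 'inst_b']},
        {'follow_instruction_list': [True, False], 'instruction_id_list': ['inst_c', 'inst_d']},
    ]

    prompt_acc, inst_acc = parse_result(outputs)
    # prompt_acc == 0.5: only the first prompt satisfied every one of its instructions
    # inst_acc == 0.75: 3 of the 4 individual instructions were followed

    prompt_res, inst_res = parse_result_no_reduce(outputs)
    # prompt_res == [1, 0] and inst_res == [1.0, 0.5]: per-sample values with no averaging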
@@ -0,0 +1,161 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Chinese',
+     'English',
+     'German',
+     'Italian',
+     'Vietnamese',
+     'Spanish',
+     'Hindi',
+     'Portuguese',
+     'French',
+     'Thai',
+     'Russian',
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='multi_if',
+         pretty_name='Multi-IF',
+         description=
+         'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.', # noqa: E501
+         tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+         dataset_id='facebook/Multi-IF',
+         subset_list=SUBSET_LIST,
+         metric_list=[
+             'prompt_level_strict',
+             'inst_level_strict',
+             'prompt_level_loose',
+             'inst_level_loose',
+         ],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train',
+         extra_params={
+             'max_turns': 3, # maximum number of turns to evaluate
+         }
+     )
+ )
+ class MultiIFAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         # Ensure required packages are installed
+         check_import(
+             module_name=['nltk', 'langdetect'],
+             package=['nltk', 'langdetect'],
+             raise_error=True,
+             feature_name=self.pretty_name
+         )
+         if 'Chinese' in self.subset_list:
+             check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+         if 'Thai' in self.subset_list:
+             check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+         self.reformat_subset = True
+         self.max_turns = self.extra_params.get('max_turns', 3)
+         if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+             logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+             self.max_turns = 3
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=[ChatMessageUser(content='')], # NOTE: we will build the multi turn conversation in the evaluator
+             target='',
+             subset_key=record['language'],
+             metadata=record,
+         )
+
+     def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+         """
+         Run multi-turn inference with the model and sample.
+         """
+         record = sample.metadata
+         history = []
+         step_record = {}
+         for step in range(1, self.max_turns + 1):
+             current_prompt = json.loads(record[f'turn_{step}_prompt'])
+             history.append(ChatMessageUser(content=current_prompt['content']))
+             # Generate model output
+             model_output = model.generate(input=history, tools=sample.tools)
+
+             response = model_output.completion
+             instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+             kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+             _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+             step_record[step] = {
+                 'prompt': messages_pretty_str(history),
+                 'response': response,
+                 'instruction_id_list': instruction_id_list,
+                 'kwargs': _kwargs
+             }
+
+             # Append model output to history for next turn
+             history.append(model_output.message)
+
+         sample.metadata['step_record'] = step_record
+         return TaskState(
+             model=model.name,
+             sample=sample,
+             messages=history,
+             output=model_output,
+             completed=True,
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate evaluation scores by comparing prediction with reference.
+         """
+         from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         step_record = task_state.metadata['step_record']
+         results = {}
+         try:
+             for step, record in step_record.items():
+                 outputs_strict = gen_acc_strict(record)
+                 outputs_loose = gen_acc_loose(record)
+                 prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                 prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                 results.update({
+                     f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                     f'turn_{step}_inst_level_strict': inst_level_strict,
+                     f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                     f'turn_{step}_inst_level_loose': inst_level_loose,
+                 })
+             score.value.update(results)
+
+             # Set main score name
+             if results:
+                 score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+         except Exception as e:
+             logger.error(f'Error calculating ifeval metrics: {e}')
+             score.value = {}
+
+         return score
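
Once registered under the name 'multi_if', the adapter is addressed through the usual run_task / TaskConfig entry points. A minimal sketch, assuming the service-style API evaluation settings and the dataset_args plumbing (subset_list, extra_params) used by other benchmarks in this release; the model name and endpoint are placeholders:

    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='my-model',                    # placeholder: any OpenAI-compatible served model
        api_url='http://127.0.0.1:8801/v1',  # placeholder endpoint
        eval_type='service',
        datasets=['multi_if'],
        dataset_args={
            'multi_if': {
                'subset_list': ['English', 'Chinese'],
                'extra_params': {'max_turns': 2},  # MultiIFAdapter clamps this to the range 1-3
            }
        },
        limit=10,
    )
    run_task(task_cfg)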
File without changes
@@ -0,0 +1,36 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate
+
+ DESCRIPTION = (
+     'MusicTrivia is a curated dataset of multiple-choice questions covering both classical and modern music topics. '
+     'It includes questions about composers, musical periods, and popular artists, designed for evaluating '
+     'factual recall and domain-specific music knowledge.'
+ ) # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='music_trivia',
+         pretty_name='MusicTrivia',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/music-trivia',
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.SINGLE_ANSWER,
+     )
+ )
+ class MusicTriviaAdapter(MultiChoiceAdapter):
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['question'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
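
record_to_sample here is a direct field mapping, so one hypothetical record (field values invented for illustration) shows the whole contract:

    # Hypothetical record from the 'extraordinarylab/music-trivia' test split:
    record = {
        'question': 'Which composer wrote "The Four Seasons"?',
        'choices': ['Antonio Vivaldi', 'Johann Sebastian Bach', 'George Frideric Handel', 'Henry Purcell'],
        'answer': 'A',
    }
    # MusicTriviaAdapter.record_to_sample(record) copies these fields one-to-one:
    #   input    <- record['question']
    #   choices  <- record['choices']
    #   target   <- record['answer']
    #   metadata <- {}
    # MultiChoiceAdapter then renders the choices using MultipleChoiceTemplate.SINGLE_ANSWER.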
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
          tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
          description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
          'It requires the model to find specific information within a large corpus of text. '
-         '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
+         '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)', # noqa: E501
          dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
          metric_list=['acc'],
          subset_list=['english', 'chinese'],
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
          super().__init__(**kwargs)
 
          self._use_llm_judge = True
+         self.add_aggregation_name = False # Don't add aggregation name for needle haystack adapter
          # set extra params
          self.retrieval_question = self.extra_params.get(
              'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -164,7 +165,11 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
                  records.append(record)
 
              dataset = DictDataLoader(
-                 dict_list=records, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+                 dict_list=records,
+                 limit=self.limit,
+                 repeats=self.repeats,
+                 sample_fields=self.record_to_sample,
+                 shuffle=self.shuffle,
              ).load()
 
              datasets[subset_name] = dataset
@@ -355,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
 
          return score
 
-     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-         # Don't add aggregation name for needle haystack adapter
-         return super()._on_generate_report(scores, model_name, False)
-
      def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
          try:
              import os
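
The last two hunks swap a method override for a declarative switch: instead of overriding _on_generate_report to pass add_aggregation_name=False, the adapter now sets the attribute once in __init__ (the loader hunk likewise threads the adapter's shuffle setting through to DictDataLoader). A minimal sketch of how another DefaultDataAdapter subclass might use the same switch (MyAdapter is hypothetical; only the attribute name is taken from this diff):

    from evalscope.api.benchmark import DefaultDataAdapter

    class MyAdapter(DefaultDataAdapter):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # Opt out of prefixing aggregated report rows with the aggregation name,
            # the same flag NeedleHaystackAdapter sets above.
            self.add_aggregation_name = False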
File without changes
@@ -0,0 +1,52 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'BroadTwitterCorpus is a dataset of tweets collected over stratified times, places '
+     'and social uses. The goal is to represent a broad range of activities, giving a '
+     'dataset more representative of the language used in this hardest of social media '
+     'formats to process.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='broad_twitter_corpus',
+         pretty_name='BroadTwitterCorpus',
+         dataset_id='extraordinarylab/broad-twitter-corpus',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class BroadTwitterCorpusAdapter(NERAdapter):
+     """
+     Adapter for the BroadTwitterCorpus Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the BroadTwitterCorpus dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define BroadTwitterCorpus-specific entity mappings
+         self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'PER': 'Names of people, including first and last names',
+             'ORG': 'Names of companies, institutions, organizations, etc.',
+             'LOC': 'Names of locations, cities, states, countries, etc.',
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
@@ -0,0 +1,48 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='conll2003',
+         pretty_name='CoNLL2003',
+         dataset_id='evalscope/conll2003',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description='The ConLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+         'of the ConLL-2003 Shared Task conference and contains texts annotated with entities such as '
+         'people, organizations, places, and various names.',
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CoNLL2003Adapter(NERAdapter):
+     """
+     Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the CoNLL2003 dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define CoNLL2003-specific entity mappings
+         self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'PER': 'Names of people, including first and last names',
+             'ORG': 'Names of companies, institutions, organizations, etc.',
+             'LOC': 'Names of locations, cities, states, countries, etc.',
+             'MISC': 'Miscellaneous entities not in the above categories'
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
@@ -0,0 +1,85 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'Copious corpus is a gold standard corpus that covers a wide range of biodiversity '
+     'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+     'Library with over 26K sentences and more than 28K entities.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='copious',
+         pretty_name='Copious',
+         dataset_id='extraordinarylab/copious',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CopiousAdapter(NERAdapter):
+     """
+     Adapter for the Copious Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the Copious dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define Copious-specific entity mappings
+         self.entity_type_map = {
+             'TAXON': 'taxon',
+             'GEOGRAPHICAL_LOCATION': 'geographical_location',
+             'HABITAT': 'habitat',
+             'PERSON': 'person',
+             'TEMPORAL_EXPRESSION': 'temporal_expression'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'TAXON': (
+                 'Mentions of taxonomic ranks such as species, genus, and family. '
+                 'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                 'vernacular names (e.g., "flying fox"), but excludes general terms '
+                 'like "fish" or "birds" and microorganism names.'
+             ),
+             'GEOGRAPHICAL_LOCATION': (
+                 'Identifiable points or areas on the planet, including continents, '
+                 'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                 'of Mindoro", "Balayan Bay"). This also includes geographical '
+                 'coordinates (e.g., "13o 36\' 11\\" N.").'
+             ),
+             'HABITAT': (
+                 'Descriptions of environments where organisms live. This includes '
+                 'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                 'pastures") and places where parasites or epiphytes reside (e.g., '
+                 '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                 'like altitude or depth.'
+             ),
+             'PERSON': (
+                 'Proper nouns referring to person names, including those in historical '
+                 'accounts or citations related to a species observation (e.g., "In 1905, '
+                 '[Tattersall] follows..."). It excludes titles, general references like '
+                 '"the researcher", and names that are part of a taxon\'s authority.'
+             ),
+             'TEMPORAL_EXPRESSION': (
+                 'Spans of text referring to points in time. This includes specific dates '
+                 '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                 '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                 'within a taxon name\'s authority.'
+             )
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
@@ -0,0 +1,120 @@
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+ DESCRIPTION = (
+     'CrossNER is a fully-labelled collected of named entity recognition (NER) data '
+     'spanning over five diverse domains (AI, Literature, Music, Politics, Science).'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='cross_ner',
+         pretty_name='CrossNER',
+         dataset_id='extraordinarylab/cross-ner',
+         subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class CrossNERAdapter(NERAdapter):
+     """
+     Adapter for the CrossNER Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the CrossNER dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define CrossNER-specific entity mappings
+         self.entity_type_map = {}
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {}
+
+     def setup_entity_mappings(self):
+         """
+         Setup entity mappings and descriptions for prompt formatting.
+         This should be called after entity_type_map and entity_descriptions are defined.
+         """
+         if self.current_subset_name == 'ai':
+             self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+         elif self.current_subset_name == 'literature':
+             self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+         elif self.current_subset_name == 'music':
+             self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+         elif self.current_subset_name == 'politics':
+             self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+         elif self.current_subset_name == 'science':
+             self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+         # Reverse mapping for converting back from prediction to evaluation
+         self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+         # Create list of tags for prompt formatting
+         self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+         # Create description of entities for prompt
+         self.entities_description = ', '.join([
+             f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+         ])
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a record with tokens and NER tags into a Sample.
+         Creates both the raw text input and annotated text target.
+         """
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+
+         tokens: List[str] = record['tokens']
+         ner_tags: List[str] = record['ner_tags']
+
+         # Create the input text by joining tokens
+         input_text = ' '.join(tokens)
+
+         # Process tokens and tags to create annotated target text
+         target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+         # Store tokens and tags in metadata for evaluation
+         metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+         return Sample(input=input_text, target=target_text, metadata=metadata)
+
+     def format_prompt_template(self, sample):
+         """
+         Format the prompt with entity types, available tags, and text to annotate.
+         """
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.prompt_template.format(
+             entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+         )
+
+     def format_fewshot_template(self, fewshot, sample):
+         """
+         Format the few-shot prompt with all required parameters.
+         """
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
+         return self.few_shot_prompt_template.format(
+             fewshot=fewshot,
+             entities=self.entities_description,
+             entity_list=', '.join(self.entity_list),
+             text=sample.input
+         )
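
The three derived prompt fields in setup_entity_mappings are easy to trace with a concrete map. The sketch below applies the same expressions to the CoNLL2003-style mapping shown earlier in this diff, since the per-domain CrossNER maps live in the cross_ner_entities modules and are not part of this excerpt:

    entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
    entity_descriptions = {
        'PER': 'Names of people, including first and last names',
        'ORG': 'Names of companies, institutions, organizations, etc.',
        'LOC': 'Names of locations, cities, states, countries, etc.',
        'MISC': 'Miscellaneous entities not in the above categories',
    }

    # Same expressions as CrossNERAdapter.setup_entity_mappings:
    reverse_entity_map = {v.lower(): k for k, v in entity_type_map.items()}
    # -> {'person': 'PER', 'organization': 'ORG', 'location': 'LOC', 'miscellaneous': 'MISC'}

    entity_list = [f'<{ent.lower()}>' for ent in entity_type_map.values()]
    # -> ['<person>', '<organization>', '<location>', '<miscellaneous>']

    entities_description = ', '.join(
        f'{entity_type_map[tag]} ({entity_descriptions[tag]})' for tag in entity_type_map
    )
    # -> 'person (Names of people, ...), organization (Names of companies, ...), ...'

format_prompt_template and format_fewshot_template then pass entities_description and entity_list into PROMPT_TEMPLATE and FEWSHOT_TEMPLATE through their entities and entity_list placeholders, with sample.input as the text to annotate.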