evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py
@@ -0,0 +1,254 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import parse_answers, prompt
+
+ logger = get_logger()
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ MULTIPLE_ANSWER_TEMPLATE = r"""
+ #Instruction#:
+ Classify the given text into one or more of the following categories: inversion, wordplay, switchbait, paradox, and misdirection.
+
+ #Definitions#:
+ - inversion: This technique takes a well-known phrase, cliché, or social script and flips it on its head. The humour arises by reversing a familiar structure to creating a new, often satirical, meaning.
+ - wordplay: This is the use of linguistic creativity, often by exploiting the phonetics or polysemy of words. It includes puns, double entendres, and similarities.
+ - switchbait: This technique hinges on a specific phrase (the "bait") that has a culturally-embedded double meaning. The initial context is then suddenly replaced (the "switch") by a surprising second meaning. The humour is generated by this cynical or culturally-specific reinterpretation of the bait, rather than by derailing a narrative.
+ - paradox: This relies on a statement that appears logically self-contradictory but contains a latent, often humorous or profound truth. The core of the technique is the clash of seemingly incompatible ideas.
+ - misdirection: This technique leads the listener down an expected path before a final twist reveals a different, often more literal or absurd, ending.
+
+ Answer the following multiple choice question where multiple answers may be correct.
+ The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.
+
+ {question}
+
+ {choices}
+ """.strip() # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_multilabel',
+         pretty_name='DrivelologyMultilabelClassification',
+         tags=[Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['multi-label-classification'],
+         metric_list=['f1_weighted', 'f1_micro', 'f1_macro', 'exact_match'],
+         aggregation='f1_weighted',
+         eval_split='test',
+         prompt_template='{question}',
+     )
+ )
+ class DrivelologyMultilabelClassificationAdapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.categories = ['inversion', 'wordplay', 'switchbait', 'paradox', 'misdirection']
+         self.choices = {'A': 'inversion', 'B': 'wordplay', 'C': 'switchbait', 'D': 'paradox', 'E': 'misdirection'}
+         self.categories_to_letters = {v: k for k, v in self.choices.items()}
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         text: str = record['text']
+         label: List[str] = record['label']
+         question = f'Text to classify: {text}'
+         choices_list = [f'{key}. {value}' for key, value in self.choices.items()]
+         input_text = prompt(question=question, choices=choices_list, template=MULTIPLE_ANSWER_TEMPLATE)
+         content_list = [ContentText(text=input_text)]
+         target_letters = ''.join(
+             sorted([self.categories_to_letters[cat] for cat in label if cat in self.categories_to_letters])
+         )
+         metadata = {'text': text, 'label': label, 'target_letters': target_letters}
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             choices=choices_list,
+             target=target_letters,
+             metadata=metadata,
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         pattern = r'ANSWER:\s*([A-E]+)'
+         match = re.search(pattern, prediction)
+         if match:
+             letters = match.group(1).strip().upper()
+             return ''.join(sorted(set(letters)))
+         else:
+             try:
+                 answers = parse_answers(prediction)
+                 return ''.join(sorted(list(answers)))
+             except Exception as e:
+                 logger.warning(f'Could not extract answer from: {prediction}. Error: {e}')
+                 return ''
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate the match score between the prediction and reference for multilabel classification.
+
+         Args:
+             original_prediction: The original model output
+             filtered_prediction: The extracted answer (letter format, e.g., "AC")
+             reference: The reference answer (letter format, e.g., "AC")
+             task_state: The current task state
+
+         Returns:
+             Score object with metrics
+         """
+         # Create a Score object as required by the API
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Convert letter answers to category sets
+         pred_categories = set(self.choices.get(letter, '') for letter in filtered_prediction)
+         target_categories = set(self.choices.get(letter, '') for letter in reference)
+
+         # Remove empty strings (may be caused by invalid letters)
+         pred_categories = {cat for cat in pred_categories if cat}
+         target_categories = {cat for cat in target_categories if cat}
+
+         # Calculate TP (true positives), FP (false positives), and FN (false negatives)
+         tp = len(pred_categories & target_categories) # intersection
+         fp = len(pred_categories - target_categories) # in prediction but not in target
+         fn = len(target_categories - pred_categories) # in target but not in prediction
+
+         # Calculate precision, recall and F1 score
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+         # Calculate exact match (1.0 if prediction exactly matches target)
+         exact_match = 1.0 if pred_categories == target_categories else 0.0
+
+         # Store category information in metadata for later aggregation
+         category_data = {}
+         for cat in self.categories:
+             in_pred = cat in pred_categories
+             in_target = cat in target_categories
+
+             category_data[cat] = {
+                 'tp': 1 if in_pred and in_target else 0,
+                 'fp': 1 if in_pred and not in_target else 0,
+                 'fn': 1 if not in_pred and in_target else 0,
+                 'support': 1 if in_target else 0
+             }
+
+         # Set simple numerical values in score.value as expected by the API
+         score.value = {'f1': f1, 'precision': precision, 'recall': recall, 'exact_match': exact_match}
+
+         # Store category data in metadata for aggregation
+         score.metadata = {'category_data': category_data}
+
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Aggregate scores across all samples.
+         Computes weighted, macro, and micro F1 scores for multilabel classification.
+
+         Args:
+             sample_scores: List of sample scores
+
+         Returns:
+             List of aggregated scores
+         """
+         if not sample_scores:
+             return [
+                 AggScore(metric_name='f1_weighted', score=0.0, num=0, metadata={}),
+                 AggScore(metric_name='f1_micro', score=0.0, num=0, metadata={}),
+                 AggScore(metric_name='f1_macro', score=0.0, num=0, metadata={}),
+                 AggScore(metric_name='exact_match', score=0.0, num=0, metadata={})
+             ]
+
+         # Initialize category statistics
+         category_stats = {cat: {'tp': 0, 'fp': 0, 'fn': 0, 'support': 0} for cat in self.categories}
+         total_exact_matches = 0
+         num_samples = len(sample_scores)
+
+         # Aggregate statistics across all samples
+         for ss in sample_scores:
+             # Add exact match score to total
+             total_exact_matches += ss.score.value.get('exact_match', 0)
+
+             # Get category data from metadata
+             if 'category_data' in ss.score.metadata:
+                 cat_data = ss.score.metadata['category_data']
+                 for cat, stats in cat_data.items():
+                     if cat in self.categories:
+                         category_stats[cat]['tp'] += stats.get('tp', 0)
+                         category_stats[cat]['fp'] += stats.get('fp', 0)
+                         category_stats[cat]['fn'] += stats.get('fn', 0)
+                         category_stats[cat]['support'] += stats.get('support', 0)
+
+         # Calculate F1 scores for each category
+         category_f1 = {}
+         total_support = sum(stats['support'] for stats in category_stats.values())
+         f1_sum = 0.0
+
+         for cat, stats in category_stats.items():
+             tp = stats['tp']
+             fp = stats['fp']
+             fn = stats['fn']
+
+             precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+             recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+             f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+             category_f1[cat] = f1
+             f1_sum += f1
+
+         # Calculate micro-average F1 (based on aggregate TP, FP, FN)
+         total_tp = sum(stats['tp'] for stats in category_stats.values())
+         total_fp = sum(stats['fp'] for stats in category_stats.values())
+         total_fn = sum(stats['fn'] for stats in category_stats.values())
+
+         micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
+         micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
+         f1_micro = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if (
+             micro_precision + micro_recall
+         ) > 0 else 0.0
+
+         # Calculate macro-average F1 (simple average of category F1 scores)
+         f1_macro = f1_sum / len(self.categories) if self.categories else 0.0
+
+         # Calculate weighted-average F1 (weighted by support)
+         f1_weighted = 0.0
+         if total_support > 0:
+             for cat, stats in category_stats.items():
+                 cat_f1 = category_f1[cat]
+                 weight = stats['support'] / total_support
+                 f1_weighted += cat_f1 * weight
+
+         # Calculate accuracy (proportion of exact matches)
+         exact_match = total_exact_matches / num_samples
+
+         # Return list of aggregate scores
+         return [
+             AggScore(
+                 metric_name='f1_weighted',
+                 score=f1_weighted,
+                 num=num_samples,
+                 metadata={'category_f1': {
+                     cat: f1
+                     for cat, f1 in category_f1.items()
+                 }}
+             ),
+             AggScore(metric_name='f1_micro', score=f1_micro, num=num_samples, metadata={}),
+             AggScore(metric_name='f1_macro', score=f1_macro, num=num_samples, metadata={}),
+             AggScore(metric_name='exact_match', score=exact_match, num=num_samples, metadata={})
+         ]
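
A minimal standalone sketch (not part of the package diff) of how the per-category tp/fp/fn counts above roll up into micro, macro, and weighted F1 at aggregation time; the toy label sets are made up and there are no evalscope dependencies, so the arithmetic can be checked in isolation:

from collections import Counter

CATEGORIES = ['inversion', 'wordplay', 'switchbait', 'paradox', 'misdirection']

# Hypothetical (predicted set, gold set) pairs, one per sample.
samples = [
    ({'paradox'}, {'paradox'}),
    ({'wordplay', 'misdirection'}, {'wordplay'}),
    ({'inversion'}, {'inversion', 'paradox'}),
]

def f1(tp, fp, fn):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    return 2 * p * r / (p + r) if p + r else 0.0

# Accumulate per-category counts across samples, as match_score/aggregate_scores do.
stats = {c: Counter() for c in CATEGORIES}
for pred, gold in samples:
    for c in CATEGORIES:
        stats[c]['tp'] += int(c in pred and c in gold)
        stats[c]['fp'] += int(c in pred and c not in gold)
        stats[c]['fn'] += int(c not in pred and c in gold)
        stats[c]['support'] += int(c in gold)

per_cat = {c: f1(s['tp'], s['fp'], s['fn']) for c, s in stats.items()}
support = sum(s['support'] for s in stats.values())
f1_macro = sum(per_cat.values()) / len(CATEGORIES)
f1_weighted = sum(per_cat[c] * stats[c]['support'] / support for c in CATEGORIES)
f1_micro = f1(
    sum(s['tp'] for s in stats.values()),
    sum(s['fp'] for s in stats.values()),
    sum(s['fn'] for s in stats.values()),
)
print(f1_micro, f1_macro, f1_weighted)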

evalscope/benchmarks/drivelology/drivelology_selection_adapter.py
@@ -0,0 +1,49 @@
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ PROMPT_TEMPLATE = r"""
+ Tell me the best option in the following options which represents the underlying narrative of the text?
+ The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+
+ {question}
+
+ {choices}
+ """.strip() # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_selection',
+         pretty_name='DrivelologyNarrativeSelection',
+         tags=[Tags.MULTIPLE_CHOICE],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['multiple-choice-english-easy', 'multiple-choice-english-hard'],
+         metric_list=['acc'],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class DrivelologyNarrativeSelectionAdapter(MultiChoiceAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_overall_metric = False
+
+     def record_to_sample(self, record) -> Sample:
+         return Sample(
+             input=record['text'],
+             choices=record['choices'],
+             target=record['answer'],
+             metadata={},
+         )
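
For reference, a sketch of how the new drivel_selection benchmark could be invoked, assuming the TaskConfig/run_task entry points documented for earlier evalscope releases are unchanged in 1.2.0; the model name and limit below are placeholders:

from evalscope import TaskConfig, run_task

# Placeholder model id; any backend evalscope already supports should work the same way.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['drivel_selection'],
    limit=5,  # small smoke-test run
)
run_task(task_cfg=task_cfg)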

evalscope/benchmarks/drivelology/drivelology_writing_adapter.py
@@ -0,0 +1,218 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ DESCRIPTION = (
+     'Drivelology, a unique linguistic phenomenon characterised as "nonsense with depth" - '
+     'utterances that are syntactically coherent yet pragmatically paradoxical, emotionally loaded, '
+     'or rhetorically subversive.'
+ )
+
+ # Keep the original generation and evaluation templates
+ NARRATIVE_GENERATION_TEMPLATE = """
+ You need to first read and understand the text given. Generate a detailed description to illustrate the implicit narrative of the text.
+
+ Please provide your response in English, with a clear and comprehensive explanation of the narrative.
+
+ Text: {text}
+ """.strip() # noqa: E501
+
+ NARRATIVE_EVALUATION_TEMPLATE = """
+ Please act as an impartial judge and evaluate how accurately the candidate narrative matches the given reference narrative.
+ Your evaluation should consider factors such as the relevance, accuracy, depth, and level of detail of the candidate narrative compared to the reference.
+
+ Begin your evaluation by providing a short explanation in English. Be as objective as possible.
+
+ After providing your explanation, you must rate the match on a Likert scale from 1 to 5, where:
+ 1 = Very poor match
+ 2 = Poor match
+ 3 = Moderate match
+ 4 = Good match
+ 5 = Excellent match
+
+ Please format your rating strictly as: "Rating: [[X]]" where X is a whole number from 1 to 5.
+
+ [Candidate Narrative]
+ {candidate}
+
+ [Reference Narrative]
+ {reference}
+ """.strip() # noqa: E501
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drivel_writing',
+         pretty_name='DrivelologyNarrativeWriting',
+         tags=[Tags.KNOWLEDGE, Tags.REASONING],
+         description=DESCRIPTION.strip(),
+         dataset_id='extraordinarylab/drivel-hub',
+         subset_list=['narrative-writing-english'],
+         metric_list={
+             'bert_score': {
+                 'model_id_or_path': 'AI-ModelScope/roberta-large',
+                 'model_type': 'roberta-large'
+             },
+             'gpt_score': {}
+         },
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=NARRATIVE_GENERATION_TEMPLATE
+     )
+ )
+ class DrivelologyNarrativeWritingAdapter(DefaultDataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self._use_llm_judge = True # Use LLM as a judge by default
+         self.use_batch_scoring = True # Enable batch scoring
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object.
+         """
+         text = record['text']
+         reference_narrative = record['narrative']
+
+         # Format the generation prompt with the text
+         input_prompt = NARRATIVE_GENERATION_TEMPLATE.format(text=text)
+
+         # Create content list for the input
+         content_list = [ContentText(text=input_prompt)]
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=reference_narrative,
+             metadata={
+                 'text': text,
+                 'reference_narrative': reference_narrative
+             }
+         )
+
+     def batch_match_score(self, original_predictions, filtered_predictions, references, task_states):
+         """
+         Batch calculate the match scores using BERTScore.
+         """
+         from evalscope.metrics.metric import BertScore
+
+         score_args = self.metric_list.get('bert_score', {})
+         bert_scorer = BertScore(**score_args)
+         bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+         scores = []
+         for i in range(len(original_predictions)):
+             score = Score(
+                 extracted_prediction=filtered_predictions[i],
+                 prediction=original_predictions[i],
+                 value={'bert_score': bert_score_f1[i]}
+             )
+             scores.append(score)
+         return scores
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """
+         Calculate the match score using LLM judge and BERTScore.
+         """
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         # Initialize score value dictionary
+         score.value = {}
+
+         # Use LLM judge to evaluate narrative quality
+         eval_prompt = NARRATIVE_EVALUATION_TEMPLATE.format(candidate=filtered_prediction, reference=reference)
+
+         judge_response = self.llm_judge.judge(eval_prompt)
+         logger.info(f'LLM judge response received (first 100 chars): {judge_response[:100]}...')
+
+         # Extract rating using regex pattern
+         match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
+         if match:
+             rating = int(match.group(1))
+             gpt_score = (rating - 1) / 4.0 # Normalize to 0-1 scale
+             logger.info(f'Rating extracted: {rating}/5 -> {gpt_score}')
+         else:
+             # Try alternative pattern
+             alt_match = re.search(r'(\[\[|\[)([1-5])(\]\]|\])', judge_response)
+             if alt_match:
+                 rating = int(alt_match.group(2))
+                 gpt_score = (rating - 1) / 4.0
+                 logger.info(f'Rating extracted (alt pattern): {rating}/5 -> {gpt_score}')
+             else:
+                 # Last resort: standalone digit
+                 number_match = re.search(r'(?<!\d)[1-5](?!\d)', judge_response)
+                 if number_match:
+                     rating = int(number_match.group(0))
+                     gpt_score = (rating - 1) / 4.0
+                     logger.info(f'Rating extracted (fallback): {rating}/5 -> {gpt_score}')
+                 else:
+                     gpt_score = 0.0
+                     logger.warning('No rating found in response, using default 0.0')
+
+         score.value['gpt_score'] = gpt_score
+         score.explanation = f'LLM judge rating: {gpt_score:.2f}'
+
+         score.metadata = {
+             'judge_response': judge_response[:300],
+             'model': getattr(self.llm_judge, 'model_id', 'unknown')
+         }
+
+         score.main_score_name = 'gpt_score'
+         return score
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+         """
+         Aggregate scores across all samples.
+         """
+         if not sample_scores:
+             return [
+                 AggScore(metric_name='gpt_score', score=0.0, num=0, metadata={}),
+                 AggScore(metric_name='bert_score', score=0.0, num=0, metadata={})
+             ]
+
+         # Extract scores
+         gpt_scores = [ss.score.value.get('gpt_score', 0.0) for ss in sample_scores]
+         bert_scores = [ss.score.value.get('bert_score', 0.0) for ss in sample_scores]
+
+         # Calculate averages
+         avg_gpt_score = sum(gpt_scores) / len(gpt_scores) if gpt_scores else 0.0
+         avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0.0
+
+         return [
+             AggScore(
+                 metric_name='gpt_score',
+                 score=avg_gpt_score,
+                 num=len(sample_scores),
+                 metadata={
+                     'min_score': min(gpt_scores),
+                     'max_score': max(gpt_scores)
+                 }
+             ),
+             AggScore(
+                 metric_name='bert_score',
+                 score=avg_bert_score,
+                 num=len(sample_scores),
+                 metadata={
+                     'min_score': min(bert_scores),
+                     'max_score': max(bert_scores)
+                 }
+             )
+         ]
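
The gpt_score above is simply the judge's 1-5 Likert rating rescaled to [0, 1] via (rating - 1) / 4. A self-contained illustration (not from the diff) of the primary extraction path; the adapter additionally applies the fallback patterns shown above:

import re

def normalise_rating(judge_response: str) -> float:
    # Look for the strictly formatted "Rating: [[X]]" and rescale 1-5 to 0-1.
    match = re.search(r'Rating:\s*\[\[([1-5])\]\]', judge_response)
    return (int(match.group(1)) - 1) / 4.0 if match else 0.0

assert normalise_rating('Good coverage of the narrative. Rating: [[4]]') == 0.75
assert normalise_rating('No rating given.') == 0.0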

evalscope/benchmarks/drop/drop_adapter.py
@@ -41,7 +41,7 @@ Answer: 43
          description=
          'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.', # noqa: E501
          dataset_id='AI-ModelScope/DROP',
-         metric_list=['acc'],
+         metric_list=['em', 'f1'],
          few_shot_num=3,
          train_split=None,
          eval_split='validation',
@@ -54,11 +54,9 @@ class DROPAdapter(DefaultDataAdapter):
      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         if self.few_shot_num != 0:
+         if self.few_shot_num != 0 and self.few_shot_num != 3:
              self.few_shot_num = 3
              logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
-         else:
-             self.few_shot_num = 0

      def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
@@ -70,9 +68,10 @@ class DROPAdapter(DefaultDataAdapter):
          Returns:
              Sample: Sample object with input, target, and metadata.
          """
+         from .utils import _get_gold_answers

          # Parse gold answers
-         gold_answers = self._get_gold_answers(record)
+         gold_answers = _get_gold_answers(record)

          return Sample(
              input=record['question'],
@@ -102,33 +101,6 @@ class DROPAdapter(DefaultDataAdapter):
              query=query,
          )

-     def _get_gold_answers(self, input_d: dict) -> List[str]:
-         """
-         Parse the raw input labels (gold).
-         """
-
-         def _flatten_validated_answers(validated_answers):
-             """Flattens a dict of lists of validated answers."""
-             valid_answers = []
-             for i in range(len(validated_answers['number'])):
-                 valid_answers.append({
-                     'number': validated_answers['number'][i],
-                     'date': validated_answers['date'][i],
-                     'spans': validated_answers['spans'][i],
-                 })
-             return valid_answers
-
-         answers = []
-         answers_set = set()
-         candidates = [input_d['answer']] + _flatten_validated_answers(input_d['validated_answers'])
-         for candidate in candidates:
-             answer = DROPAdapter.parse_answer(candidate)
-             if answer in answers_set:
-                 continue
-             answers_set.add(answer)
-             answers.append(answer)
-         return answers
-
      def extract_answer(self, prediction: str, task_state: TaskState):
          """
          Extract the answer from the model prediction.
@@ -147,7 +119,9 @@ class DROPAdapter(DefaultDataAdapter):
          """
          Calculate accuracy score by matching prediction with reference answers.
          """
-         from .utils import _answer_to_bags
+         import numpy as np
+
+         from .utils import _align_bags, _answer_to_bags

          score = Score(
              extracted_prediction=filtered_prediction,
@@ -155,6 +129,7 @@ class DROPAdapter(DefaultDataAdapter):
          )

          max_em = 0
+         max_f1 = 0
          reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
          for gold_answer in reference:
              # Convert the answers to bags of answers
@@ -165,20 +140,16 @@ class DROPAdapter(DefaultDataAdapter):
                  exact_match = 1.0
              else:
                  exact_match = 0.0
+
+             f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+             f1_score = np.mean(f1_per_bag)
+             f1_score = round(f1_score, 2)
              # Check if the answer is empty
              if gold_answer[0].strip():
                  max_em = max(max_em, exact_match)
+                 max_f1 = max(max_f1, f1_score)

-         score.value = {'acc': max_em}
-         score.main_score_name = 'acc'
+         score.value = {'em': max_em, 'f1': max_f1}
+         score.main_score_name = 'f1'

          return score
-
-     @staticmethod
-     def parse_answer(answer):
-         # NOTE: Everything is returned as a tuple for uniformity and hashability.
-         if answer['number'] != '':
-             return (str(answer['number']), )
-         if answer['spans'] != []:
-             return tuple(answer['spans'])
-         return (' '.join([answer['date']['day'], answer['date']['month'], answer['date']['year']]).strip(), )
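
Net effect of the DROP changes: the adapter now reports both exact match and token-level F1, each taken as the best value over all gold answers. A simplified, hypothetical sketch of that max-over-golds logic (the real _answer_to_bags/_align_bags helpers in the new drop/utils.py additionally normalise numbers and align multi-span answers, so the numbers below are illustrative only):

def token_f1(pred: str, gold: str) -> float:
    # Plain whitespace-token F1; the package's bag-based version is more involved.
    p, g = pred.lower().split(), gold.lower().split()
    common = sum(min(p.count(t), g.count(t)) for t in set(p))
    if common == 0:
        return 0.0
    precision, recall = common / len(p), common / len(g)
    return 2 * precision * recall / (precision + recall)

prediction = '25 percent'
golds = ['25', '25 percent of voters']
max_em = max(float(prediction == g) for g in golds)
max_f1 = max(token_f1(prediction, g) for g in golds)
print(max_em, max_f1)  # 0.0 0.666...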