evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py
@@ -0,0 +1,146 @@
+import os
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tau2_bench',
+        pretty_name='τ²-bench',
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
+        description='τ²-bench (Tau Squared Bench) is an extension and enhancement of the original '
+        'τ-bench (Tau Bench), which is a benchmark designed to evaluate conversational AI agents '
+        'that interact with users through domain-specific API tools and guidelines. '
+        'Please install it with `pip install git+https://github.com/sierra-research/tau2-bench@v0.2.0` '
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau2_bench.html)', # noqa: E501
+        dataset_id='evalscope/tau2-bench-data',
+        subset_list=['airline', 'retail', 'telecom'],
+        aggregation='mean_and_pass_hat_k',
+        eval_split='test',
+        extra_params={
+            'user_model': 'qwen-plus',
+            'api_key': 'EMPTY',
+            'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'generation_config': {
+                'temperature': 0.0,
+                'max_tokens': 4096,
+            }
+        }
+    )
+)
+class Tau2BenchAdapter(AgentAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import(
+            'tau2',
+            package='git+https://github.com/sierra-research/tau2-bench@v0.2.0',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )
+
+        # setup user model args
+        self.user_model = self.extra_params.get('user_model', 'qwen-plus')
+        self.api_key = self.extra_params.get('api_key', 'EMPTY')
+        self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})
+
+    def load(self):
+        # Load dataset
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(dataset_name_or_path)
+
+        # Set Tau2 data dir
+        os.environ['TAU2_DATA_DIR'] = dataset_path
+
+        # Load data for each domain
+        from tau2.agent.llm_agent import LLMGTAgent
+        from tau2.registry import registry
+
+        data_dict = defaultdict(dict)
+        for domain_name in self.subset_list:
+            logger.info(f'Loading Tau2-Bench environment: {domain_name}')
+            # Get tasks
+            task_loader = registry.get_tasks_loader(domain_name)
+            tasks = task_loader()
+            tasks = [task for task in tasks if LLMGTAgent.check_valid_task(task)]
+            tasks = [task.model_dump(exclude_unset=True) for task in tasks]
+
+            # load dataset
+            dataset = DictDataLoader(
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
+            ).load()
+
+            data_dict[domain_name] = dataset
+
+        test_dataset = DatasetDict(data_dict)
+
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['description']['purpose'] or '')],
+            target='', # Will use the record for evaluation
+            subset_key=record['user_scenario']['instructions']['domain'],
+            metadata=record # Store the full record for evaluation
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        from .generation import predict
+        return predict(model, sample, adapter_instance=self)
+
+    def match_score(self, original_prediction: str, filtered_prediction: str, reference: str, task_state) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Parse the prediction to get the reward
+            task_result = task_state.metadata['task_result']
+            reward = task_result['reward']
+
+            score.value = {
+                'acc': float(reward),
+            }
+            score.explanation = f'Task completed with reward: {reward}'
+            score.metadata = {
+                'task_result': task_result,
+            }
+            score.main_score_name = 'acc'
+
+        except Exception as e:
+            score.value = {'acc': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'acc'
+
+        return score
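The new adapter above reads its user-simulator settings from `extra_params`. For reference, a minimal sketch of how `tau2_bench` could be invoked through evalscope's Python API, assuming the documented `TaskConfig`/`run_task` entry points and the usual `dataset_args` override mechanism; the agent model, endpoint and key are placeholders, while the `extra_params` values mirror the defaults shown in the adapter:

    from evalscope import TaskConfig, run_task

    # Illustrative configuration for the new tau2_bench benchmark; adjust the agent
    # model, user-simulator model and API credentials to your own environment.
    task_cfg = TaskConfig(
        model='qwen-plus',  # agent model under evaluation (placeholder)
        datasets=['tau2_bench'],
        dataset_args={
            'tau2_bench': {
                'subset_list': ['airline', 'retail', 'telecom'],
                'extra_params': {
                    'user_model': 'qwen-plus',  # model that simulates the user
                    'api_key': 'EMPTY',
                    'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                    'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
                },
            }
        },
    )
    run_task(task_cfg)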
evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py}
@@ -45,7 +45,7 @@ def _patch_agent_solve(model: Model):
             input=[dict_to_chat_message(msg) for msg in messages],
             tools=[ToolInfo.model_validate(tool['function']) for tool in self.tools_info]
         )
-        oai_res = openai_chat_choices(res.choices)
+        oai_res = openai_chat_choices(res.choices, include_reasoning=False)

         next_message = oai_res[0].message.model_dump(exclude_none=True)

evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py}
@@ -1,8 +1,7 @@
-import importlib
 from collections import defaultdict
 from typing import Dict, List

-from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.dataset.dataset import DatasetDict
 from evalscope.api.dataset.loader import DictDataLoader
@@ -13,6 +12,7 @@ from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils import get_logger
 from evalscope.utils.function_utils import run_once
+from evalscope.utils.import_utils import check_import

 logger = get_logger()

@@ -21,47 +21,43 @@ logger = get_logger()
     BenchmarkMeta(
         name='tau_bench',
         pretty_name='τ-bench',
-        tags=[Tags.FUNCTION_CALLING, Tags.REASONING],
+        tags=[Tags.FUNCTION_CALLING, Tags.REASONING, Tags.AGENT],
         description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
         'and a language agent provided with domain-specific API tools and policy guidelines. '
         'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` '
-        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/tau_bench.html)', # noqa: E501
+        'before evaluating and set a user model. [Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/tau_bench.html)', # noqa: E501
         dataset_id='https://github.com/sierra-research/tau-bench',
         subset_list=['airline', 'retail'],
-        metric_list=['Pass^1'],
+        aggregation='mean_and_pass_hat_k',
         eval_split='test',
         extra_params={
             'user_model': 'qwen-plus',
             'api_key': 'EMPTY',
             'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
             'generation_config': {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
+                'temperature': 0.0,
+                'max_tokens': 4096,
             }
         }
     )
 )
-class TauBenchAdapter(DefaultDataAdapter):
+class TauBenchAdapter(AgentAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        spec = importlib.util.find_spec('tau_bench')
-        if spec is None:
-            raise ImportError(
-                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.' # noqa: E501
-            )
+        check_import(
+            'tau_bench',
+            package='git+https://github.com/sierra-research/tau-bench',
+            raise_error=True,
+            feature_name=self.pretty_name
+        )

         # setup user model args
         self.user_model = self.extra_params.get('user_model', 'qwen-plus')
         self.api_key = self.extra_params.get('api_key', 'EMPTY')
         self.api_base = self.extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
-        self.generation_config = self.extra_params.get(
-            'generation_config', {
-                'temperature': 0.7,
-                'max_new_tokens': 1024
-            }
-        )
+        self.generation_config = self.extra_params.get('generation_config', {'temperature': 0.0, 'max_tokens': 4096})

         self._patch_env_completion()

@@ -84,10 +80,10 @@ class TauBenchAdapter(DefaultDataAdapter):

             res = user_server.generate(input=[dict_to_chat_message(msg) for msg in messages])

-            message = res.message.model_dump(exclude_none=True)
+            message = {'role': 'assistant', 'content': res.completion}
             self.messages.append(message)
             self.total_cost = 0
-            return message['content']
+            return res.completion

         # get the current instance of TauBenchAdapter
         adapter_instance = self
@@ -114,7 +110,11 @@ class TauBenchAdapter(DefaultDataAdapter):
            })
            # load dataset
            dataset = DictDataLoader(
-                dict_list=tasks, sample_fields=self.record_to_sample, limit=self.limit, repeats=self.repeats
+                dict_list=tasks,
+                sample_fields=self.record_to_sample,
+                limit=self.limit,
+                repeats=self.repeats,
+                shuffle=self.shuffle,
            ).load()

            data_dict[env_name] = dataset
@@ -145,24 +145,24 @@ class TauBenchAdapter(DefaultDataAdapter):

         try:
             # Parse the prediction to get the reward
-            res = task_state.metadata
-            reward = res.get('reward', 0.0)
+            task_result = task_state.metadata['task_result']
+            reward = task_result.get('reward', 0.0)

             score.value = {
-                'Pass^1': float(reward),
+                'acc': float(reward),
             }
             score.explanation = f'Task completed with reward: {reward}'
             score.metadata = {
-                'task_result': res,
+                'task_result': task_result,
                 'env_name': task_state.metadata.get('env_name', 'unknown'),
                 'task_index': task_state.metadata.get('task_index', -1)
             }
-            score.main_score_name = 'Pass^1'
+            score.main_score_name = 'acc'

         except Exception as e:
-            score.value = {'Pass^1': 0.0}
+            score.value = {'acc': 0.0}
             score.explanation = f'Evaluation failed: {str(e)}'
             score.metadata = {'error': str(e)}
-            score.main_score_name = 'Pass^1'
+            score.main_score_name = 'acc'

         return score
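Both τ-bench adapters now report the per-trial reward under a single 'acc' key and delegate aggregation to 'mean_and_pass_hat_k'. As a rough illustration of what a pass^k ("pass hat k") aggregation computes, here is the estimator used in τ-bench-style evaluation, assuming n repeated trials per task with c successes; this is an illustrative helper, not code from this package:

    from math import comb

    def pass_hat_k(num_trials: int, num_successes: int, k: int) -> float:
        """Unbiased estimate of the chance that k i.i.d. trials on a task all succeed."""
        # comb(c, k) is 0 when c < k, so tasks with too few successes contribute 0.
        return comb(num_successes, k) / comb(num_trials, k)

    # Aggregate across tasks: the mean of per-task pass^k values.
    rewards = [[1, 1, 0, 1], [0, 0, 1, 0]]  # example binary rewards, 4 trials per task
    k = 2
    print(sum(pass_hat_k(len(r), sum(r), k) for r in rewards) / len(rewards))  # 0.25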
evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py
@@ -16,8 +16,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='evalmuse',
+        pretty_name='EvalMuse',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='EvalMuse Text-to-Image Benchmark',
+        description='EvalMuse Text-to-Image Benchmark. Used for evaluating the quality '
+        'and semantic alignment of finely generated images',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['EvalMuse'],
         metric_list=['FGA_BLIP2Score'],
evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py
@@ -4,7 +4,6 @@ import os
 from evalscope.api.benchmark import BenchmarkMeta, Text2ImageAdapter
 from evalscope.api.dataset import Sample
 from evalscope.api.messages import ChatMessageUser
-from evalscope.api.metric.scorer import Score
 from evalscope.api.registry import get_metric, register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger
@@ -15,8 +14,9 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='genai_bench',
+        pretty_name='GenAI-Bench',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='GenAI-Bench Text-to-Image Benchmark',
+        description='GenAI-Bench Text-to-Image Benchmark. Includes 1600 prompts for text-to-image task.',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['GenAI-Bench-1600'],
         metric_list=['VQAScore'],
evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py
@@ -16,7 +16,7 @@ logger = get_logger()
         name='general_t2i',
         dataset_id='general_t2i',
         description='General Text-to-Image Benchmark',
-        tags=[Tags.TEXT_TO_IMAGE],
+        tags=[Tags.TEXT_TO_IMAGE, Tags.CUSTOM],
         subset_list=['default'],
         metric_list=['PickScore'],
         few_shot_num=0,
evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py
@@ -14,8 +14,10 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='hpdv2',
+        pretty_name='HPD-v2',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-        description='HPDv2 Text-to-Image Benchmark',
+        description='HPDv2 Text-to-Image Benchmark. Evaluation metrics based on human preferences, '
+        'trained on the Human Preference Dataset (HPD v2)',
         tags=[Tags.TEXT_TO_IMAGE],
         subset_list=['HPDv2'],
         metric_list=['HPSv2.1Score'],
@@ -41,7 +43,10 @@ class HPDv2Adapter(Text2ImageAdapter):
         return Sample(
             input=[ChatMessageUser(content=record['prompt'])],
             metadata={
+                'id': record['id'],
+                'prompt': record['prompt'],
                 'category': record.get('tags', {}).get('category', ''),
-                'tags': record.get('tags', {})
+                'tags': record.get('tags', {}),
+                'image_path': record.get('image_path', ''), # Optional field for existing image path
             }
         )
evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py
@@ -10,6 +10,7 @@ logger = get_logger()
 @register_benchmark(
     BenchmarkMeta(
         name='tifa160',
+        pretty_name='TIFA-160',
         dataset_id='AI-ModelScope/T2V-Eval-Prompts',
         description='TIFA-160 Text-to-Image Benchmark',
         tags=[Tags.TEXT_TO_IMAGE],
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -1,7 +1,7 @@
 import json
 from typing import Any, Dict

-from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
@@ -21,14 +21,14 @@ logger = get_logger()
         description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
         'It includes various subsets such as in-domain and out-of-domain, '
         'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
-        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html)',
         dataset_id='AI-ModelScope/ToolBench-Static',
         subset_list=['in_domain', 'out_of_domain'],
         metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
         eval_split='test',
     )
 )
-class ToolBenchAdapter(DefaultDataAdapter):
+class ToolBenchAdapter(AgentAdapter):
     """
     ToolBench adapter using the new data processing framework.
     """
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -37,6 +37,7 @@ TRUTHFUL_QA_PROMPT = (
         dataset_id='evalscope/truthful_qa',
         metric_list=['multi_choice_acc'],
         subset_list=['multiple_choice'],
+        shuffle_choices=True,
         few_shot_num=0,
         train_split=None,
         eval_split='validation',
@@ -55,8 +56,6 @@ class TruthfulQaAdapter(MultiChoiceAdapter):

         super().__init__(**kwargs)

-        self.shuffle_choices = True
-
         self.multiple_correct = self.extra_params.get('multiple_correct', False)
         if self.multiple_correct:
             self.prompt_template = MultipleChoiceTemplate.MULTIPLE_ANSWER
evalscope/benchmarks/visu_logic/visu_logic_adapter.py
@@ -0,0 +1,75 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import parse_answers
+
+logger = get_logger()
+
+MULT_CHOICE_PROMPT = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+{question}
+"""
+
+SUBSET_LIST = [
+    'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+    'Attribute Reasoning'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='visulogic',
+        pretty_name='VisuLogic',
+        dataset_id='evalscope/VisuLogic',
+        tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+        description=
+        'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class VisuLogicAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record.get('question', '')
+        content_list: List[Content] = []
+        prompt_text = self.prompt_template.format(question=question).strip()
+        content_list.append(ContentText(text=prompt_text))
+
+        image = record.get('image')
+        if image and isinstance(image, dict):
+            image_bytes = image.get('bytes')
+            if image_bytes:
+                image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'id': record['id'],
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['label'],
+            choices=['A', 'B', 'C', 'D'],
+            subset_key=record['tag'],
+            metadata=metadata,
+        )
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        answers = parse_answers(task_state)
+        return ''.join(sorted(list(answers)))