evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
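The deleted test modules shown below exercised benchmarks through evalscope's TaskConfig and run_task interface. As a rough orientation for the many benchmark adapters added in this release, a minimal sketch of that same pattern follows; it assumes the TaskConfig/run_task API is unchanged in 1.2.0 and that the new ChartQA adapter registers under the dataset name 'chartqa' (neither assumption is confirmed by this diff), with placeholder credentials:

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task

    # Hypothetical smoke run of a benchmark added in 1.2.0; the dataset name
    # and API key are illustrative placeholders, not values taken from this diff.
    task_cfg = TaskConfig(
        model='qwen-plus',
        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
        api_key='YOUR_API_KEY',
        eval_type=EvalType.SERVICE,
        datasets=['chartqa'],
        limit=5,
    )
    run_task(task_cfg=task_cfg)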
tests/benchmark/test_eval.py DELETED
@@ -1,386 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import unittest
- from unittest import TestCase
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class TestBenchmark(TestCase):
-     """Benchmark evaluation test cases."""
-
-     def setUp(self):
-         """Setup common test configuration."""
-         self.base_config = {
-             'model': 'qwen-plus',
-             'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'eval_type': EvalType.SERVICE,
-             'eval_batch_size': 5,
-             'limit': 5,
-             'generation_config': {
-                 'max_tokens': 4096,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'parallel_tool_calls': True
-             },
-             'judge_strategy': JudgeStrategy.AUTO,
-             'judge_worker_num': 5,
-             'judge_model_args': {
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096,
-                 }
-             },
-             'debug': True,
-         }
-
-     def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
-         """Helper method to run test for a specific dataset."""
-         config = self.base_config.copy()
-         config['datasets'] = [dataset_name]
-
-         if use_mock:
-             config['eval_type'] = EvalType.MOCK_LLM
-
-         # Apply config overrides
-         config.update(config_overrides)
-
-         if dataset_args:
-             config['dataset_args'] = {dataset_name: dataset_args}
-
-         task_cfg = TaskConfig(**config)
-         run_task(task_cfg=task_cfg)
-
-     def _run_dataset_load_test(self, dataset_name, dataset_args=None):
-         """Helper method to test dataset loading."""
-
-         self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
-
-     # Math & Reasoning datasets
-     def test_gsm8k(self):
-         """Test GSM8K math reasoning dataset."""
-         self._run_dataset_test('gsm8k')
-
-     def test_gsm8k_local(self):
-         """Test GSM8K math reasoning dataset with local path."""
-         dataset_args = {
-             'local_path': 'data/gsm8k',
-         }
-         self._run_dataset_test('gsm8k', dataset_args=dataset_args, use_mock=True)
-
-     def test_mmlu(self):
-         """Test MMLU reasoning dataset."""
-         dataset_args = {
-             'few_shot_num': 0,
-             # 'subset_list': ['abstract_algebra', 'computer_security']
-         }
-         self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
-
-     def test_mmlu_pro(self):
-         """Test MMLU-Pro reasoning dataset."""
-         dataset_args = {
-             'few_shot_num': 2,
-             'subset_list': ['computer science', 'math']
-         }
-         self._run_dataset_test('mmlu_pro', use_mock=False, dataset_args=dataset_args, repeats=2)
-
-     def test_mmlu_redux(self):
-         """Test MMLU-Redux reasoning dataset."""
-         dataset_args = {
-             'subset_list': ['abstract_algebra', 'computer_security'],
-         }
-         # self._run_dataset_load_test('mmlu_redux', dataset_args)
-         self._run_dataset_test('mmlu_redux', dataset_args=dataset_args)
-
-     def test_cmmlu(self):
-         """Test C-MMLU reasoning dataset."""
-         dataset_args = {
-             'subset_list': ['agronomy', 'computer_security'],
-             'few_shot_num': 0,
-         }
-         # self._run_dataset_load_test('cmmlu')
-         self._run_dataset_test('cmmlu', dataset_args=dataset_args)
-
-     def test_math_500(self):
-         """Test MATH 500 dataset."""
-         # self._run_dataset_load_test('math_500')
-         self._run_dataset_test('math_500')
-
-     def test_aime24(self):
-         """Test AIME 2024 dataset."""
-         self._run_dataset_test('aime24')
-
-     def test_aime25(self):
-         """Test AIME 2025 dataset."""
-         self._run_dataset_test('aime25')
-
-     def test_competition_math(self):
-         """Test Competition Math dataset."""
-         dataset_args = {
-             'subset_list': ['Level 4']
-         }
-         self._run_dataset_test('competition_math', dataset_args)
-
-     # Knowledge & QA datasets
-     def test_arc(self):
-         """Test ARC dataset."""
-         # self._run_dataset_load_test('arc')
-         dataset_args = {
-             'subset_list': ['ARC-Easy', 'ARC-Challenge'],
-             'few_shot_num': 2,
-         }
-         self._run_dataset_test('arc', dataset_args=dataset_args)
-
-     def test_ceval(self):
-         """Test CEval dataset."""
-         dataset_args = {
-             'subset_list': ['logic', 'law'],
-             # 'few_shot_num': 0,
-         }
-         # self._run_dataset_load_test('ceval')
-         self._run_dataset_test('ceval', dataset_args=dataset_args)
-
-     def test_super_gpqa(self):
-         """Test Super GPQA dataset."""
-         # self._run_dataset_load_test('super_gpqa')
-
-         dataset_args = {
-             'subset_list': ['History', 'Psychology'],
-             'few_shot_num': 0,
-         }
-         self._run_dataset_test('super_gpqa', dataset_args=dataset_args, ignore_errors=True)
-
-     def test_gpqa(self):
-         """Test GPQA dataset."""
-         # self._run_dataset_load_test('gpqa_diamond')
-         dataset_args = {
-             'few_shot_num': 0,
-         }
-         self._run_dataset_test('gpqa_diamond', dataset_args=dataset_args, ignore_errors=True)
-
-     def test_iquiz(self):
-         """Test IQuiz dataset."""
-         dataset_args = {
-             'subset_list': ['IQ', 'EQ'],
-             'few_shot_num': 0,
-         }
-         self._run_dataset_test('iquiz', dataset_args=dataset_args)
-
-     def test_maritime_bench(self):
-         """Test MaritimeBench dataset."""
-         dataset_args = {
-             'subset_list': ['default'],
-             'few_shot_num': 0,
-         }
-         self._run_dataset_test('maritime_bench', dataset_args=dataset_args)
-
-     def test_musr(self):
-         """Test MuSR dataset."""
-         dataset_args = {
-             'subset_list': ['murder_mysteries', 'object_placements', 'team_allocation'],
-             'few_shot_num': 0,
-         }
-         self._run_dataset_test('musr', dataset_args=dataset_args)
-
-     def test_hellaswag(self):
-         """Test HellaSwag dataset."""
-         self._run_dataset_test('hellaswag')
-
-     def test_truthful_qa(self):
-         """Test TruthfulQA dataset."""
-         dataset_args = {
-             'extra_params': {
-                 'multiple_correct': True
-             }
-         }
-         self._run_dataset_test('truthful_qa', dataset_args=dataset_args)
-
-     def test_trivia_qa(self):
-         """Test TriviaQA dataset."""
-         self._run_dataset_test('trivia_qa')
-
-     def test_race(self):
-         """Test RACE dataset."""
-         self._run_dataset_test('race')
-
-     def test_winogrande(self):
-         """Test winogrande"""
-         self._run_dataset_test('winogrande')
-
-     def test_bbh(self):
-         dataset_args = {
-             'subset_list': ['temporal_sequences', 'navigate'],
-         }
-         self._run_dataset_test('bbh', dataset_args=dataset_args)
-
-     def test_simple_qa(self):
-         """Test SimpleQA dataset."""
-         self._run_dataset_test('simple_qa')
-
-     def test_chinese_simpleqa(self):
-         """Test Chinese SimpleQA dataset."""
-         dataset_args = {
-             'subset_list': ['中华文化']
-         }
-         self._run_dataset_test('chinese_simpleqa', dataset_args)
-
-     # Code datasets
-     def test_live_code_bench(self):
-         """Test LiveCodeBench dataset."""
-         dataset_args = {
-             'extra_params': {
-                 'start_date': '2024-08-01',
-                 'end_date': '2025-02-28'
-             },
-             'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
-         }
-         self._run_dataset_test('live_code_bench', dataset_args)
-
-     def test_humaneval(self):
-         """Test HumanEval dataset."""
-         self._run_dataset_test('humaneval')
-
-     # Custom & specialized datasets
-     def test_general_qa(self):
-         """Test custom general QA dataset."""
-         dataset_args = {
-             'local_path': 'custom_eval/text/qa',
-             'subset_list': ['example']
-         }
-         self._run_dataset_test('general_qa', dataset_args)
-
-     def test_general_mcq(self):
-         """Test custom general MCQ dataset."""
-         dataset_args = {
-             'local_path': 'custom_eval/text/mcq',
-             'subset_list': ['example']
-         }
-         self._run_dataset_test('general_mcq', dataset_args)
-
-     def test_alpaca_eval(self):
-         """Test AlpacaEval dataset."""
-         self._run_dataset_test('alpaca_eval')
-
-     def test_arena_hard(self):
-         """Test Arena Hard dataset."""
-         self._run_dataset_test('arena_hard', use_cache='outputs/20250818_211353')
-
-     def test_frames(self):
-         """Test Frames dataset."""
-         dataset_args = {
-             # 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-         }
-         self._run_dataset_test('frames', dataset_args)
-
-     def test_docmath(self):
-         """Test DocMath dataset."""
-         self._run_dataset_test('docmath')
-
-     def test_drop(self):
-         """Test DROP dataset."""
-         dataset_args = {
-             'few_shot_num': 3,
-         }
-         self._run_dataset_test('drop', dataset_args=dataset_args)
-
-     def test_ifeval(self):
-         """Test IFEval dataset."""
-         self._run_dataset_test('ifeval')
-
-     def test_needle_haystack(self):
-         """Test Needle in Haystack dataset."""
-         dataset_args = {
-             'subset_list': ['english'],
-             'extra_params': {
-                 'context_lengths_max': 10000,
-                 'context_lengths_num_intervals': 5,
-                 'document_depth_percent_intervals': 5,
-                 'show_score': True,
-             }
-         }
-         self._run_dataset_test('needle_haystack', dataset_args)
-
-     def test_ifeval(self):
-         """Test IFEval dataset."""
-         self._run_dataset_test('ifeval')
-
-     def test_hle(self):
-         """Test HLE dataset."""
-         dataset_args = {
-             'subset_list': ['Math', 'Other'],
-             'extra_params': {
-                 'include_multi_modal': False
-             }
-         }
-         self._run_dataset_test('hle', dataset_args)
-
-     def test_process_bench(self):
-         """Test ProcessBench dataset."""
-         dataset_args = {
-             'subset_list': ['gsm8k', 'math'],
-         }
-         self._run_dataset_test('process_bench', dataset_args, use_cache='outputs/20250819_161844')
-
-     def test_humaneval(self):
-         """Test HumanEval dataset."""
-         dataset_args = {
-             'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']
-         }
-         self._run_dataset_test('humaneval', dataset_args, repeats=5)
-
-     def test_live_code_bench(self):
-         """Test LiveCodeBench dataset."""
-         dataset_args = {
-             'subset_list': ['v6'],
-             'extra_params': {
-                 'start_date': '2024-08-01',
-                 'end_date': '2025-02-28'
-             },
-         }
-         self._run_dataset_test('live_code_bench', dataset_args, judge_worker_num=1)
-
-     def test_tool_bench(self):
-         """Test ToolBench dataset."""
-         self._run_dataset_test('tool_bench')
-
-     def test_bfcl(self):
-         """Test BFCL dataset."""
-         dataset_args = {
-             'subset_list': ['simple', 'live_multiple', 'multi_turn_base'],
-             'extra_params': {
-                 'is_fc_model': True,
-                 'underscore_to_dot': True
-             }
-         }
-         self._run_dataset_test('bfcl_v3', dataset_args)
-
-     def test_tau_bench(self):
-         dataset_args = {
-             'extra_params': {
-                 'user_model': 'qwen-plus',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'generation_config': {
-                     'temperature': 0.7,
-                     'max_new_tokens': 1024
-                 }
-             }
-         }
-         self._run_dataset_test('tau_bench', dataset_args, limit=1)
-
- if __name__ == '__main__':
-     # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
-     # Run all tests: python -m unittest test_eval.TestBenchmark
-     unittest.main()
tests/cli/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
tests/cli/test_all.py DELETED
@@ -1,229 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
-
- import os
- import unittest
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.logger import get_logger
- from tests.utils import test_level_list
-
- os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
- logger = get_logger()
-
- datasets=[
-     'iquiz',
-     'ifeval',
-     'mmlu',
-     'mmlu_pro',
-     'musr',
-     'process_bench',
-     'race',
-     'trivia_qa',
-     'cmmlu',
-     'humaneval',
-     'gsm8k',
-     'bbh',
-     'competition_math',
-     'math_500',
-     'aime24',
-     'gpqa_diamond',
-     'arc',
-     'ceval',
-     'hellaswag',
-     'general_mcq',
-     'general_qa',
-     'super_gpqa',
-     # 'live_code_bench',
-     'mmlu_redux',
-     'simple_qa',
-     'chinese_simpleqa',
-     'alpaca_eval',
-     'arena_hard',
-     'maritime_bench',
-     'drop',
-     'winogrande',
-     'tool_bench',
-     'frames',
-     'docmath',
-     'needle_haystack',
-     'bfcl_v3',
-     'hle',
-     'tau_bench',
- ]
-
- # Reverse the datasets list to ensure the order is from most recent to oldest
- datasets.reverse()
-
- dataset_args={
-     'mmlu': {
-         'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-         'few_shot_num': 0
-     },
-     'mmlu_pro': {
-         'subset_list': ['math', 'health'],
-         'few_shot_num': 4
-     },
-     'ceval': {
-         'subset_list': [
-             'computer_network', 'operating_system', 'computer_architecture'
-         ],
-         'few_shot_num': 0
-     },
-     'cmmlu': {
-         'subset_list': ['elementary_chinese'],
-         'few_shot_num': 0
-     },
-     'bbh': {
-         'subset_list': ['word_sorting', 'movie_recommendation'],
-     },
-     'gpqa_diamond': {
-         'few_shot_num': 0,
-     },
-     'humaneval': {
-         'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-     },
-     'competition_math': {
-         'subset_list': ['Level 1']
-     },
-     'math_500': {
-         'subset_list': ['Level 1']
-     },
-     'process_bench': {
-         'subset_list': ['gsm8k'],
-     },
-     'musr': {
-         'subset_list': ['murder_mysteries']
-     },
-     'general_mcq': {
-         'local_path': 'custom_eval/text/mcq',  # custom dataset path
-         'subset_list': [
-             'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-         ],
-     },
-     'general_qa': {
-         'local_path': 'custom_eval/text/qa',  # custom dataset path
-         'subset_list': [
-             'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-             # 'test'
-         ]
-     },
-     'super_gpqa': {
-         'subset_list': ['Philosophy', 'Education'],
-         'few_shot_num': 0
-     },
-     'live_code_bench': {
-         'subset_list': ['v4_v5'],
-         'extra_params': {
-             'start_date': '2024-12-01',
-             'end_date': '2025-01-01'
-         },
-     },
-     'chinese_simpleqa': {
-         'subset_list': ['中华文化']
-     },
-     'mmlu_redux':{
-         'subset_list': ['abstract_algebra']
-     },
-     'docmath':{
-         'subset_list': ['simpshort_testmini']
-     },
-     'bfcl_v3':{
-         'subset_list': ['simple', 'multiple']
-     },
-     'hle': {
-         'subset_list': ['Math', 'Other'],
-     },
-     'tau_bench': {
-         'extra_params': {
-             'user_model': 'qwen-plus',
-             'api_key': env.get('DASHSCOPE_API_KEY'),
-             'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-         },
-         'subset_list': ['airline'],
-     },
- }
-
- class TestRun(unittest.TestCase):
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_benchmarks(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=datasets,
-             dataset_args=dataset_args,
-             eval_batch_size=1,
-             limit=1,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             judge_worker_num=5,
-             judge_strategy=JudgeStrategy.AUTO,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-             }
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_ci_lite(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 'general_mcq',
-                 'general_qa',
-                 'iquiz',
-             ],
-             dataset_args={
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',
-                     'subset_list': [
-                         'example'
-                     ],
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',
-                     'subset_list': [
-                         'example'
-                     ]
-                 }
-             },
-             eval_batch_size=1,
-             limit=1,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             judge_worker_num=1,
-             judge_strategy=JudgeStrategy.AUTO,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-             }
-         )
-
-         run_task(task_cfg=task_cfg)