evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/report/__init__.py CHANGED
@@ -4,9 +4,16 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import gen_table, get_data_frame, get_report_list
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        percentage_weighted_average_from_subsets,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
     from .generator import ReportGenerator
-    from .report import Category, Report, ReportKey, Subset
+    from .report import Category, Metric, Report, ReportKey, Subset
 
 else:
     _import_structure = {
@@ -14,7 +21,9 @@ else:
             'gen_table',
             'get_data_frame',
             'get_report_list',
-            'gen_report_table',
+            'weighted_average_from_subsets',
+            'unweighted_average_from_subsets',
+            'percentage_weighted_average_from_subsets',
         ],
         'generator': [
             'ReportGenerator',
@@ -24,6 +33,7 @@ else:
             'Report',
             'ReportKey',
             'Subset',
+            'Metric',
         ],
     }
 
evalscope/report/combinator.py CHANGED
@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.report import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -88,26 +88,97 @@ def gen_table(
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-class ReportsRecorder:
-    COMMON_DATASET_PATH = []
-    CUSTOM_DATASET_PATH = []
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
 
-    def __init__(self, oss_url: str = '', endpoint: str = ''):
-        pass
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
+
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
+
+
+def percentage_weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], weights: List[float], new_name: str = ''
+) -> Subset:
+    """Calculate percentage weighted average for given subsets.
+
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        weights (List[float]): The weight for each corresponding accuracy entry.
+            Can sum to any positive value – they will be normalised internally.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with percentage weighted average score.
+    """
+    assert len(subset_names) == len(weights), \
+        'The number of subset names must match the number of weights.'
+
+    valid_subsets = []
+    valid_weights = []
+    total_count = 0
+
+    for name, weight in zip(subset_names, weights):
+        if name in subset_dict:
+            subset = subset_dict[name]
+            valid_subsets.append(subset)
+            valid_weights.append(weight)
+            total_count += subset.num
+
+    if not valid_subsets:
+        return Subset(name=new_name, score=0, num=0)
 
+    weight_sum = sum(valid_weights)
+    assert weight_sum > 0, \
+        f"Sum of weights for percentage_weighted_average_from_subsets for '{new_name}' is not positive."
 
-if __name__ == '__main__':
-    report_dir_1 = './outputs/20250117_151926'
-    # report_dir_2 = './outputs/20250107_204445/reports'
+    # Normalise weights so that they sum to 1.0
+    weights_norm = [w / weight_sum for w in valid_weights]
 
-    report_table = gen_table(reports_path_list=[report_dir_1])
-    print(report_table)
+    total_score = 0
+    for subset, weight in zip(valid_subsets, weights_norm):
+        total_score += subset.score * weight
 
-    # ALL VALUES ONLY FOR EXAMPLE
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
+    return Subset(name=new_name, score=total_score, num=total_count)
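A minimal usage sketch (Python) of the three averaging helpers added above. The import path follows the evalscope/report/__init__.py hunk earlier in this diff; the subset names, scores, and counts are invented for illustration.

# Sketch only: imports follow the __init__.py hunk above, values are made up.
from evalscope.report import (
    Subset,
    percentage_weighted_average_from_subsets,
    unweighted_average_from_subsets,
    weighted_average_from_subsets,
)

subset_dict = {
    'easy': Subset(name='easy', score=0.90, num=100),
    'hard': Subset(name='hard', score=0.50, num=300),
}
names = ['easy', 'hard']

# Weighted by sample count: (0.90 * 100 + 0.50 * 300) / 400 = 0.60
weighted = weighted_average_from_subsets(names, subset_dict, new_name='overall')

# Plain mean of the subset scores: (0.90 + 0.50) / 2 = 0.70
unweighted = unweighted_average_from_subsets(names, subset_dict, new_name='overall')

# Caller-supplied weights, normalised internally: 0.25 * 0.90 + 0.75 * 0.50 = 0.60
percentage = percentage_weighted_average_from_subsets(names, subset_dict, weights=[1, 3], new_name='overall')

print(weighted.score, unweighted.score, percentage.score)  # approximately 0.6 0.7 0.6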
evalscope/report/generator.py CHANGED
@@ -8,105 +8,26 @@ from evalscope.report.report import *
 if TYPE_CHECKING:
     from evalscope.api.benchmark import DataAdapter
     from evalscope.api.metric import AggScore
-    from evalscope.benchmarks import DataAdapter as OldDataAdapter
 
 
 class ReportGenerator:
 
     @staticmethod
-    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'OldDataAdapter', **kwargs) -> Report:
-        """
-        Generate a report for a specific dataset based on provided subset scores.
-
-        Args:
-            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
-                {
-                    'subset_name': [
-                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
-                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
-                    ],
-                    ...
-                }
-            report_name (str): The name of the report to generate.
-            data_adapter (DataAdapter): An adapter object for data handling.
-
-        Returns:
-            Report: A structured report object containing metrics, categories, and subsets.
-
-        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
-        """  # noqa: E501
-
-        dataset_name = data_adapter.name
-        category_map = data_adapter.category_map
-        report_name = f'{model_name}@{dataset_name}'
-
-        def flatten_subset() -> DataFrame:
-            """
-            Flatten subset score map to a DataFrame.
-
-            Example:
-                            name  score  num  categories      metric_name
-                0        ARC-Easy    0.5    2   [default]  AverageAccuracy
-                1   ARC-Challenge    0.5    2   [default]  AverageAccuracy
-            """
-            subsets = []
-            for subset_name, scores in subset_score_map.items():
-                for score_item in scores:
-                    categories = category_map.get(subset_name, ['default'])
-                    if isinstance(categories, str):
-                        categories = [categories]
-                    subsets.append(
-                        dict(
-                            name=subset_name,
-                            score=score_item['score'],
-                            num=score_item['num'],
-                            metric_name=score_item['metric_name'],
-                            categories=tuple(categories)
-                        )
-                    )
-            df = pd.DataFrame(subsets)
-            return df
-
-        df = flatten_subset()
-
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
         metrics_list = []
-        for metric_name, group_metric in df.groupby('metric_name', sort=False):
+        for metric_name, group_metric in df.groupby('metric', sort=False):
             categories = []
             for category_name, group_category in group_metric.groupby('categories'):
                 subsets = []
-                for _, row in group_category.iterrows():
-                    subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
-
+                for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name',
+                                                                                          'subset_name']):
+                    avg_score = group_subset['score'].mean()
+                    num = group_subset['score'].count()
+                    subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
                 categories.append(Category(name=category_name, subsets=subsets))
-
             metrics_list.append(Metric(name=metric_name, categories=categories))
-
-        report = Report(
-            name=report_name,
-            metrics=metrics_list,
-            dataset_name=dataset_name,
-            model_name=model_name,
-            dataset_description=data_adapter.description,
-            dataset_pretty_name=data_adapter.pretty_name
-        )
-        return report
-
-    @staticmethod
-    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
-        categories = []
-        for category_name, group_category in df.groupby('categories'):
-            subsets = []
-            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
-                avg_score = group_subset['score'].mean()
-                num = group_subset['score'].count()
-                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
-
-            categories.append(Category(name=category_name, subsets=subsets))
         return Report(
-            name=DataCollection.NAME,
-            metrics=[Metric(name='Average', categories=categories)],
-            dataset_name=all_dataset_name,
-            model_name=model_name
+            name=DataCollection.NAME, metrics=metrics_list, dataset_name=all_dataset_name, model_name=model_name
         )
 
     @staticmethod
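The refactored gen_collection_report now emits one Metric per value of the 'metric' column instead of a single 'Average' metric. A hypothetical call is sketched below; the column names mirror the groupby keys in the hunk above, and the rows are invented for illustration.

import pandas as pd
from evalscope.report import ReportGenerator

df = pd.DataFrame([
    {'metric': 'acc', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 1.0},
    {'metric': 'acc', 'categories': ('default', ), 'dataset_name': 'gsm8k', 'subset_name': 'main', 'score': 0.0},
    {'metric': 'acc', 'categories': ('default', ), 'dataset_name': 'math_500', 'subset_name': 'level1', 'score': 1.0},
])

report = ReportGenerator.gen_collection_report(df, all_dataset_name='my_collection', model_name='my_model')
# Expected shape: one 'acc' metric with one ('default', ) category whose subsets are
# 'gsm8k/main' (score 0.5 over 2 samples) and 'math_500/level1' (score 1.0 over 1 sample).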
evalscope/report/report.py CHANGED
@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
         score = round(score, keep_num)
     elif isinstance(score, dict):
         score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
     else:
         logger.warning(f'Unknown score type: {type(score)}')
-
     return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
     subset_name = 'Subset'
     num = 'Num'
     score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                 table[ReportKey.num].append(subset.num)
                 table[ReportKey.score].append(subset.score)
             # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric:
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                 table[ReportKey.model_name].append(self.model_name)
                 table[ReportKey.dataset_name].append(self.dataset_name)
                 table[ReportKey.metric_name].append(metric.name)
                 table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append('OVERALL')
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                 table[ReportKey.num].append(metric.num)
                 table[ReportKey.score].append(metric.score)
         # NOTE: only flatten metrics if needed, use the first metric by default
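A quick illustration of the widened normalize_score: integers are now coerced to float instead of falling through to the unknown-type warning. The direct import path is assumed from the hunk above.

from evalscope.report.report import normalize_score  # import path assumed

normalize_score(0.123456)           # -> 0.1235 (floats rounded to keep_num digits)
normalize_score({'acc': 0.987654})  # -> {'acc': 0.9877}
normalize_score(3)                  # -> 3.0 (new int branch)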
evalscope/run.py CHANGED
@@ -38,6 +38,7 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     if task_cfg.eval_backend != EvalBackend.NATIVE:
         result = run_non_native_backend(task_cfg, outputs)
     else:
+        logger.info('Running with native backend')
         result = evaluate_model(task_cfg, outputs)
 
     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
@@ -94,12 +95,15 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> d
     def get_backend_manager_class(eval_backend: EvalBackend):
         """Get the backend manager class based on the evaluation backend."""
         if eval_backend == EvalBackend.OPEN_COMPASS:
+            logger.info('Using OpenCompassBackendManager')
             from evalscope.backend.opencompass import OpenCompassBackendManager
             return OpenCompassBackendManager
         elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+            logger.info('Using VLMEvalKitBackendManager')
            from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
            return VLMEvalKitBackendManager
        elif eval_backend == EvalBackend.RAG_EVAL:
+            logger.info('Using RAGEvalBackendManager')
            from evalscope.backend.rag_eval import RAGEvalBackendManager
            return RAGEvalBackendManager
        elif eval_backend == EvalBackend.THIRD_PARTY:
@@ -131,8 +135,9 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         )
         evaluators.append(evaluator)
 
-        # Update task_config.dataset_args with benchmark metadata
-        task_config.dataset_args[dataset_name] = benchmark.to_dict()
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
 
     # dump task_cfg to outputs.configs_dir after creating evaluators
     task_config.dump_yaml(outputs.configs_dir)
@@ -149,17 +154,20 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
     if model is not None:
         import gc
-        import torch
 
         del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
+    from evalscope.utils.import_utils import check_import
+    if check_import('torch', raise_warning=False):
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
     return eval_results
 
 
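The cleanup change above boils down to the following pattern, shown as a standalone sketch: torch is imported, and the CUDA cache cleared, only when torch is installed and a GPU is available. Only the check_import('torch', raise_warning=False) call is taken from the hunk; the wrapper function name is hypothetical.

from evalscope.utils.import_utils import check_import

def free_gpu_cache():  # hypothetical helper name
    if check_import('torch', raise_warning=False):
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()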
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,9 +1,5 @@
-import os
 from dataclasses import dataclass
-from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
-# 设置GPU环境变量
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
evalscope/utils/argument_utils.py CHANGED
@@ -61,4 +61,4 @@ def parse_int_or_float(num):
 def get_supported_params(func):
     """Get the supported parameters of a function."""
     sig = signature(func)
-    return list(sig.parameters.keys())
+    return set(sig.parameters.keys())
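Returning a set instead of a list gives callers an order-independent collection with O(1) membership tests. A self-contained re-implementation for illustration; the signature helper is assumed to come from the standard-library inspect module, as the surrounding code suggests.

from inspect import signature  # assumed source of signature in the original module

def get_supported_params(func):
    """Get the supported parameters of a function."""
    sig = signature(func)
    return set(sig.parameters.keys())

def generate(prompt, temperature=1.0, max_tokens=256):
    ...

params = get_supported_params(generate)
assert params == {'prompt', 'temperature', 'max_tokens'}  # order-independent comparison
assert 'temperature' in params                            # O(1) membership test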
evalscope/utils/chat_service.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer