evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (117) hide show
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,34 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
2
  evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
3
- evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
3
+ evalscope/config.py,sha256=1YfHXlIyYH70FQfi8TiUtpUH3VIRCh5YcbaayKZo5s4,6781
4
4
  evalscope/constants.py,sha256=1CYghe0fGccyiVgzMIHd2HIb6lOo9fmB-8pH_l99iI4,4014
5
- evalscope/run.py,sha256=ss7ECL4dq18ur9qFOWqCNIsckXQWWl1EsVaJxDPBVq8,7000
6
- evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
- evalscope/summarizer.py,sha256=nZOaXfaSaXht8GAVik_Pvz2YL0Gv24UG45mMklyBkvA,5938
8
- evalscope/version.py,sha256=VHNGbQIK9g2FDZyk0Yk7RSDY_XsEEtvEBuN8kjAA8PM,119
5
+ evalscope/run.py,sha256=dL1deJ0J1RHW6X6ZStXzAVL7NwbjW6McfdOMkCpWrtc,7012
6
+ evalscope/summarizer.py,sha256=ZLFDHmi0Bgo18ouQsxuUl9vmIES9zkoapLLWRLhy19Q,5911
7
+ evalscope/version.py,sha256=IZr-isfEmPkZ2eTCGlS5vvkiE5fMlg3HeXpgVmjGGJY,119
9
8
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
10
- evalscope/app/app.py,sha256=QyO0RFfkLeOVzx-Mr8br3bYPwii2O_eVGmNgwCGHkac,29863
9
+ evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
11
10
  evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
12
- evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
11
+ evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
12
+ evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
13
+ evalscope/app/ui/app_ui.py,sha256=FvpHsr4Lc0LAcwXIaVn9sUAAjO8QLNYCuojmKrjKvaE,2023
14
+ evalscope/app/ui/multi_model.py,sha256=7pe71PSaU7gnvogmCBMLUr_DUAgAeni12k4QcoHkFDs,15040
15
+ evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
16
+ evalscope/app/ui/single_model.py,sha256=K5SU_S7WXWsbYLih2rQfRrVE50enzbCrq4rbhpo2uXo,9406
17
+ evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
18
+ evalscope/app/utils/data_utils.py,sha256=TMgiDu4MGvWgyd8G_nNOAOw39ZCRVFfRLLvrxCX_Ocw,6806
19
+ evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
20
+ evalscope/app/utils/text_utils.py,sha256=7DJow2W3Fna5Qny-AbwVRTWDh44ualONS5A5uUtesuk,3590
21
+ evalscope/app/utils/visualization.py,sha256=N9M7OV6lxcCvFtXmLBcUWw3RPlYZva7YH3rvhgTElqk,3522
13
22
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
23
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
15
24
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
16
25
  evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
17
- evalscope/backend/opencompass/backend_manager.py,sha256=kIPzirjAOW0_YNQiCrhjRfAVD3UpcGmr4RXBH-WMH0Y,10409
26
+ evalscope/backend/opencompass/backend_manager.py,sha256=POEYRmNlptoRYlTNcpRcHEXwqrYo34RW4TM_kf7wMQQ,10458
18
27
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
19
28
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
20
29
  evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
21
30
  evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
22
- evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
31
+ evalscope/backend/rag_eval/backend_manager.py,sha256=iEer5IhEJ8nOXW_s3j6l5jvfLgBftcGQMAtJk69Wzdc,3521
23
32
  evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
24
33
  evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
25
34
  evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=2OdPj4gSUWdAGCfS9PHpPGbd6q5RqEyli2G6UGb1ffw,8888
@@ -53,15 +62,16 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
53
62
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
54
63
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
64
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
56
- evalscope/backend/rag_eval/utils/embedding.py,sha256=64DQrGzB2sw_Y0twwlSmOYobpOfgmRBFLfVMOc39UTk,9370
65
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=uqodHHvOKlza-bCLJ9Zkm8G1Jt2y2JT88jtIqCjA0sA,9379
57
66
  evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
58
67
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
59
68
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
60
- evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
61
- evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
69
+ evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
70
+ evalscope/benchmarks/__init__.py,sha256=NVd_VvmkY36LxdHNmgeogSBwMFfWoLJAZF8vDg-CoFc,1308
62
71
  evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
63
- evalscope/benchmarks/data_adapter.py,sha256=NgaKHfm288hVGeG1l_xGbLvB-Gno4M7Xd5Pa2ozY17Q,22975
64
- evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
72
+ evalscope/benchmarks/data_adapter.py,sha256=t_IOA6hvPrF_mrAzwgS-HP1aRQ_sI-3s9oSpRxmtFLg,23475
73
+ evalscope/benchmarks/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
74
+ evalscope/benchmarks/utils.py,sha256=37Pn9SxRqF0WoLR7LcGJF9xASh4VxcUL93v03WHmnh8,1813
65
75
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
76
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
77
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
@@ -71,16 +81,16 @@ evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z
71
81
  evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
72
82
  evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
73
83
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- evalscope/benchmarks/aime/aime24_adapter.py,sha256=hVoQMXpp_DSoZuJzCQLbAAUR8p4h9_1WcFUxelGUJBA,2036
75
- evalscope/benchmarks/aime/aime25_adapter.py,sha256=TJ2pivciL8LhffGP6lZPMBqaaTzuaCN_00Bz51E7QFI,2037
84
+ evalscope/benchmarks/aime/aime24_adapter.py,sha256=iwOvjB-hwUYFRNDTe8xuRCFxASh69gCzuU3Vz9qnsUs,2070
85
+ evalscope/benchmarks/aime/aime25_adapter.py,sha256=fNJXUSCxjGyvtX_gkp4bveC_oXHwr1VNQdUePAuwjIE,2071
76
86
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=AwrtuC_6o2Wa1zGnZ080OCuWv8S-hwvGHJqZ7KPQwoI,4328
87
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=BLU3G7IB3gmIYiXtznzHjPIrvi65nYZwqSF7FFnP7Aw,4324
78
88
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
79
89
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
80
- evalscope/benchmarks/arc/arc_adapter.py,sha256=BG_VeTyN88oXu7qquhva2ou1I3-RePzXLxQCsY_ne2M,6682
90
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=OO2khZxfgsRzYk64zLvq4yEbgPdQuvbIVPO4t0E4Hcc,6703
81
91
  evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=FBwkxfnbyXgTiFmwKA5mjIOb_eOuUnXrijM4rrBHZE4,6672
83
- evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
92
+ evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=iJeIh-xiQbuc3E8ea48DTCfDW_KnlGMdTeIek5AlKnk,6668
93
+ evalscope/benchmarks/arena_hard/utils.py,sha256=kRgKXdVt4Ep3XGOzUQpf9JThnp1OOt8oUQhvQEtOzRY,4596
84
94
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
85
95
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=IFu9XctrLNJcIFXK4jV3LmyqQCVb66z8YhL07Osc1TA,8623
86
96
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -113,17 +123,17 @@ evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlf
113
123
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
124
  evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=MQPlfMvTQYHA4EP5g7eNzXDs4A4QvgYOiGC458Z39q4,10080
115
125
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=jZNOtaTwiyXAA6wQ8udXKyOo-f2mKOPjE6q7mrKCPXQ,11639
126
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=V_TC_E0lKXaFcV_qIdrg2_iddmGJ4um8iIdaXVaK_EM,11146
117
127
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
118
128
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
129
  evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=kaZ8fZK2a9oVwpGRUA3wz3FkxtcTY_FkRDYrdLjDNro,8433
120
130
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
121
131
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
122
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=QdeXKS8TdEua8lWWjoNOLvSB2fN3AKa7pKV0xjwmwME,10596
132
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=3oh79iFR006vnlpwjsRVO5cl6pOav00I5uU98DPCORM,10119
123
133
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
124
134
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
125
135
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
126
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=yubOKBm8IqskyuEYkbUDxdkUCmVJE1-yB5SxxMWyHjA,7004
136
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ZBIZJZDSy-b9lTgm2-ZU2pEh053rveMwccI1fu6xpkc,7038
127
137
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
138
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=z_wbrA4yJoMwfg4TJkvEZB2aV5cPFcxCZ3JIj49F4Do,2604
129
139
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -135,10 +145,13 @@ evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46
135
145
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
146
  evalscope/benchmarks/frames/frames_adapter.py,sha256=xYvxGzqj_YPDSZYogP9TxUhOxvZFbud1S2SOvz1nlDU,3136
137
147
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
148
+ evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
+ evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=j2aDzikz9obxvrR-damdvSCXR0rfjEo-OzX8vujj2N0,19887
150
+ evalscope/benchmarks/general_arena/utils.py,sha256=u0q4FNIOFka1_gC344OCvBXUz89Ah6M8asjIXbNSweM,7188
138
151
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=r2qLKe8esRe45t2CoYzDiZXlq0zO6jVR-iiqLvdmn7Y,5160
152
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=B5zMiywH06NWOZsxAOwP-4aE3DbJB3Oyi9tlbM2BEHU,5181
140
153
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
141
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=NFeV3rPSfv7_imlEnCI3oSi7aSJGGX2JDqzgvyLVOFw,4861
154
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=qdiH_XoWy3JNrBaUxl9S0bH16k0gXcx8dexZQflH74o,5443
142
155
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
156
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
144
157
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
@@ -147,40 +160,40 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
147
160
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYfckv8x15_OXIwTI,10752
148
161
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
149
162
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
150
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=l4bHGYaU66ga9J09_QTrrqM9zrzA7mpwQ9Ul7Uy47ig,6176
163
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=kgHz-n8_93J8DdR7XBlzfM2KDRoKcvg80h6CCjWv_Xk,6191
151
164
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
152
165
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
153
166
  evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=O6muXpiBrQ9RGSglnl3gS0yO6BSkQtXASMR9yXUfhEE,5515
154
167
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=C7Zww11RGbPzlB7dy-mef-2uHOVXFTdLc5W48_PM5xM,2172
168
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=2oStqiTD4w2f2n0kbjcbg7GJQfKCsHFieokQcNndWb4,2041
156
169
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
157
170
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
158
171
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
159
172
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
160
173
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=ZwUWpVe5gkEC3l5wTo-XdePHiDjQbHDhX2W0WTS5mC4,2715
174
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=kYXKiiFa_F5Gl3mIOAtKxXW5myi0VW_XGidbSjArd6M,2730
162
175
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
163
176
  evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=iqmVUMZmyRhzOOXXQ-NN9P1nGvvbzTjOSEp6djbN_rw,6503
164
177
  evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
165
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=IHqEjfK_2O5Tk1kvWJCOcnEGIVW8Ujes6aLVm5YnkEg,3789
178
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=a4Vz73V1q8A0tV_DeKnTJKVxnDWmXs84diaqSym8gLM,3550
166
179
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
167
180
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
168
181
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
169
182
  evalscope/benchmarks/live_code_bench/testing_util.py,sha256=abjlwp6HDayf88mMI_daOKm06nEOeNBaMkmGWqk2DJo,17286
170
183
  evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=WXpieeLsr_BRd48fSHswdKvO2uUGYNDNfB4FyReDW9o,3134
184
+ evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=aibJmtIJkpvWlyLBiiL7TCdjUGfW8pxkAU2KQEZDIPM,3149
172
185
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=qrfqXrSSBJ0JzkhMg_6_gZtK6eWyMtgr_WiFqtssQ9c,2290
186
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=Oc9XnBgMAjEerYAk3GtY2TTKm1QH_UI896kUuW2_a5Y,2324
174
187
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
175
188
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
176
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=NlodlICpGVz9_MjRn-FfCMGIfmEPBBXgMtczcxuvRlc,12090
189
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=58z0BQDYq1TKTzFVDbVdpJVlBOv0pJtuAu7uS8gVwbA,12111
177
190
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
178
191
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=zAW3lvWXkGqYsPbVfMj5tc5EuDXLCGLFNPT8sLcKuO0,4539
192
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=uglOOZBZfQBIuJOG7iT4THk2LNcfHQoakxQDpS4jB1U,4554
180
193
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=8FRC1lQX-Pv5Tji1Lsp5Mr456JvtGT1lU9c3hVO25l4,9871
194
+ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=fYtAVKKGGfzRnDlEzU7IULruj2vYzey9aWoyZBBeftc,9886
182
195
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
- evalscope/benchmarks/musr/musr_adapter.py,sha256=lh0UrE3yqWzmOw_ALkxJJ9AbBn11HlQMYHO39P1HAnE,2676
196
+ evalscope/benchmarks/musr/musr_adapter.py,sha256=YTRFGsVuogdYlZoylfD3ij4AbyYrvT4hpY7MueVfu6c,2691
184
197
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
185
198
  evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=AybH_Ka2B2WCh-EvwAsMPlCGzJ78dHBhe5sJ6nDgNK4,15691
186
199
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
@@ -189,7 +202,7 @@ evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcw
189
202
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ULuXG68ifTEc_ucH_cj0p5AGdbL-ahA7kcJ-AzYVmSM,3767
190
203
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
191
204
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
192
- evalscope/benchmarks/race/race_adapter.py,sha256=JjIGGthWbktrsBL68rE-hvVY9ZOwKrrZzJoIdBdNoWg,6614
205
+ evalscope/benchmarks/race/race_adapter.py,sha256=FW_FSUGq5Iyz2cTACdk3qOqDt2kXwtCpVB9FT_Bc6LM,6635
193
206
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
194
207
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
195
208
  evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=gQzrmslukHOJf-VBSnVKYddIg34EEOvQuGYTurQgBy0,9289
@@ -204,12 +217,12 @@ evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgES
204
217
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
205
218
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
206
219
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
207
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=xvgt3SQQ0g5qT_RkZ1YOoYPxDS_CZrBJbDIKQjF-xEo,5328
220
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=IT5l6cFzZQi2i68kp8rWBdXWxiDVd14MORgk-lusPBM,5516
208
221
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
209
222
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
210
223
  evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=xY4Kr-GzyyE_TWGlaKL5mo9qTaza0frWLy7EgIwlZn4,12958
211
224
  evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
212
- evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=WSJv4TDLISUy66e_PZEfjrIwsQOhgPXqeyA30nBwetM,2194
225
+ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=GkmTsrlpU1IA-E7dJXmsHXyY9ivRbmbeVKxFmMwWtLc,2209
213
226
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
214
227
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
215
228
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -222,14 +235,12 @@ evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt
222
235
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
223
236
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
224
237
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
225
- evalscope/evaluator/evaluator.py,sha256=pQ85iNgnA9ME2b7UNH33uybcStjSQffJTh55ZFqwCNk,22115
226
- evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
227
- evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
228
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
229
- evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
230
- evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
238
+ evalscope/evaluator/evaluator.py,sha256=0vxXyNt1v2wjzUXwC_7nc_-3wmT18yQ-QKDA9DDVXpA,22441
239
+ evalscope/metrics/__init__.py,sha256=LlqXdOPiWTTAzxuKdUwTYO0KgN3Zh1zs18H2sM_5o5I,1709
240
+ evalscope/metrics/completion_parsers.py,sha256=LJnHD_ea7SLfRVXIAHuzeJx0mAgbpzmf3VYcQRL-CdA,8733
241
+ evalscope/metrics/llm_judge.py,sha256=9DCT4p9llNSVUc-K6inrHGpHVdReHlEz_kJQyRezvz8,8268
231
242
  evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
232
- evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
243
+ evalscope/metrics/metrics.py,sha256=f-KFVBJi6hOm9K7dFJSPCQDe5opEOzeb0z1YvhkKXb0,13797
233
244
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
234
245
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
235
246
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
@@ -340,21 +351,21 @@ evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cux
340
351
  evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,6308
341
352
  evalscope/models/register.py,sha256=WiylzfL-vb6Bl3H3_RdIaBabVOAc9tiuhsQzYJDVzTg,1948
342
353
  evalscope/models/adapters/__init__.py,sha256=zmldx8yC_KTI8NDRcxNLyPzv19wc57UvOVvzwyuYnG4,647
343
- evalscope/models/adapters/base_adapter.py,sha256=z98FiFCZwNSmQElkB7ONwswvUQZxqrCikngZDg0Nn5w,3311
354
+ evalscope/models/adapters/base_adapter.py,sha256=TfINK84g4mqmHcnqvvHmk-MXRN2Pkan4yVlVd4j0nVY,3166
344
355
  evalscope/models/adapters/bfcl_adapter.py,sha256=KtreuJ21X1lcUGGhVgW3U62p3P65_oydMdBPtE5um-I,10332
345
356
  evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
346
357
  evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
347
358
  evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
348
- evalscope/models/adapters/server_adapter.py,sha256=tS-SurglnYYuAyXikR-550pE48KUVGpNoeZ8G_y47yA,9602
359
+ evalscope/models/adapters/server_adapter.py,sha256=4fyC7fM_L_hn1SDqqDJAvMjEvBHVbTNF7xZHrO9bnhI,9616
349
360
  evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
350
361
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
351
362
  evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
352
- evalscope/models/custom/dummy_model.py,sha256=aZg_OZ6yFNg2macxS5iCymIdFHODdQGH4OOwMXQe4SM,3113
363
+ evalscope/models/custom/dummy_model.py,sha256=WpfrS3kvwRRdyThx9baaJ5vodYYh29VGRKsGKMWFflI,3124
353
364
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
354
- evalscope/perf/arguments.py,sha256=uBKqT_s5aG3a295MxE2VIzs9_8XXxhenN2TdZbsYXEA,10865
365
+ evalscope/perf/arguments.py,sha256=4YNmgTl4c76dvcL1GqxHsfRrs5cx6pvsT-6ss7weRC4,10415
355
366
  evalscope/perf/benchmark.py,sha256=cjUpJ3SRnZVBs_H24yqLh4WG_hcCADrniLG1VsmByb8,7901
356
367
  evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
357
- evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
368
+ evalscope/perf/main.py,sha256=UdtzFWG5M9VDeuM2EvD6pqRNw5EubRadU74K-PnCLpU,3400
358
369
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
359
370
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
360
371
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -372,38 +383,17 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpY
372
383
  evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
373
384
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
374
385
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
375
- evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
376
- evalscope/perf/utils/benchmark_util.py,sha256=EPKUDijue85b8KhSJoJKLh6comkTKRjq2yoEw4kxBho,7227
386
+ evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
387
+ evalscope/perf/utils/benchmark_util.py,sha256=pv__38XjxkTqOfcREnod40WxeMJe4okDuVcYjyySDtg,7258
377
388
  evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
378
389
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
379
390
  evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
380
391
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
381
392
  evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
382
- evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
383
- evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
384
- evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
385
- evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
386
- evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
387
- evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
388
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
389
- evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
390
- evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
391
- evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
392
- evalscope/registry/tasks/arc.yaml,sha256=MghUuCmZPEwGqwYhA8ClRWHiSwC3kbHcKMRicQl9aqc,765
393
- evalscope/registry/tasks/bbh.yaml,sha256=GE3PpE8zw_SROj41LZ5bTm6ZXXZjYOorAdwBCTEePXM,604
394
- evalscope/registry/tasks/bbh_mini.yaml,sha256=8o9ZiWaCTkN2uTwiOhjBQuyKm7GUw6ZfUZxb2bkOmvs,678
395
- evalscope/registry/tasks/ceval.yaml,sha256=XDaszb7DROKk8nQDiklirTvDJwkOUJtIN_tcUFVvIJk,703
396
- evalscope/registry/tasks/ceval_mini.yaml,sha256=4aYW4c0IzgAXSs5dp4d8dJ0OHVp5sD4uiRjChjL1zZg,672
397
- evalscope/registry/tasks/cmmlu.yaml,sha256=yOgKl1jmfcAfTuUcIMmG5SQhkrbEHEyyP3YuCuIN3l0,703
398
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=egdiM5oG7RSs0M-g8QNikwhJ9tZVgw5FiLy-rIYYHAA,737
399
- evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNiAhGEdUqL-8c,702
400
- evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
401
- evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
402
- evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
403
- evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
393
+ evalscope/report/__init__.py,sha256=DIoXbj0mjs1m2kEgFvIyqy4skDuoBu0UDVmTDa60Ymk,905
404
394
  evalscope/report/combinator.py,sha256=4ahUtTFPTNiSjamldX3IcLf33yKTJKs6ZsC4fsCafe8,4192
405
395
  evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
406
- evalscope/report/utils.py,sha256=A8_bo-97UKA7Ys5slZ4TydCno9p7-Y3rxLpOd8gmAjM,7685
396
+ evalscope/report/utils.py,sha256=taTSLvMKzAtJ9oha7pe0WF2UZZfEqPQgdj4urq7ZJIE,8298
407
397
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
408
398
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
409
399
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -439,40 +429,39 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
439
429
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
440
430
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
441
431
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
442
- evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
443
- evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
432
+ evalscope/utils/__init__.py,sha256=OiVmYHVkC_d8s6Zp1I6p6oTyhCEGvN-I9E6uzn8dgF4,1940
433
+ evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
444
434
  evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
445
- evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
446
435
  evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
447
- evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
448
- evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
449
- evalscope/utils/io_utils.py,sha256=atRCynX9dFcZGxCDip8HRpdzVkkTXCK6y4HzfiOEFU8,5615
436
+ evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
437
+ evalscope/utils/io_utils.py,sha256=xvFvVu3Hy2HJFvemvREdFh6en7SNmfrsnikK-Mj-q6Q,6828
450
438
  evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
451
- evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
452
- evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
439
+ evalscope/utils/model_utils.py,sha256=F1_WBHvBehWqrTd6kPtKICeeYucaZn5H0Gc3cCplYB8,2329
453
440
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
454
441
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
442
+ tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
455
443
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
456
- tests/aigc/test_t2i.py,sha256=Dqug3rV7EIkj6uwBjgj5UMj8ZrpGSznSHfn2g8J_P3M,3860
444
+ tests/aigc/test_t2i.py,sha256=XtVknpwlVMb6FSw3_WMFxMq0gZX6iG-ffdSQkcW2Fzw,3856
457
445
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
458
- tests/cli/test_all.py,sha256=yo1ysDM90dI_kWxKKPOf-BsYneeRYRJa5uh6_7SDZ3Y,4332
459
- tests/cli/test_collection.py,sha256=jIGQNQO4msJE9w4Ms5qxtuhkHVukeLcHvBF2dzHCKCI,4207
460
- tests/cli/test_run.py,sha256=RoS9Qtlwsm0sGJdeCWZbBrVDfkZV3iKOB9UtkeM1KWs,18651
446
+ tests/cli/test_all.py,sha256=GmY6g-EQCb_RJY4R76MeF9pvgYyzQHBxwn7Y_9BMwns,5866
447
+ tests/cli/test_collection.py,sha256=bXWzccH822Y2B1Ed251U6TE8G_osI6MXYNxzmfv9kBI,4197
448
+ tests/cli/test_custom.py,sha256=0YE-TCAeaQMRVRFla_TIvTd8d0USvvsSeqvYAD3NDNg,8796
449
+ tests/cli/test_run.py,sha256=Al8-CZeoWZH-c-YIg6qUIKtSIfRdzlEBsgVsl-WMosk,17570
461
450
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
462
- tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
451
+ tests/perf/test_perf.py,sha256=u82U7QYkA5JvR-iw0f4MNpWuEQOYid2g9cQ11ma7NAU,4844
463
452
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
464
- tests/rag/test_clip_benchmark.py,sha256=uykLrRCfNR8aOiLJI0GdSL4mOys3q0LFHsA_Ur7xudc,2658
465
- tests/rag/test_mteb.py,sha256=38cDYpqf0ozvrWf36I7z_O_DmAUCbF9LX06us65xNXk,7209
466
- tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
453
+ tests/rag/test_clip_benchmark.py,sha256=13pcY3gYHNQh2KfEHCqtCSqiOcbngSJ1BlVZzI58JCE,2694
454
+ tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
455
+ tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
467
456
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
468
- tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
469
- tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
470
- tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
457
+ tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
458
+ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
459
+ tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
471
460
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
472
- tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
473
- evalscope-0.16.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
474
- evalscope-0.16.2.dist-info/METADATA,sha256=e60FJsG6ufvawkoGbh8146wtVCE6AA0mb9cnhIDdaSE,36533
475
- evalscope-0.16.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
476
- evalscope-0.16.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
477
- evalscope-0.16.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
478
- evalscope-0.16.2.dist-info/RECORD,,
461
+ tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
462
+ evalscope-0.17.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
463
+ evalscope-0.17.0.dist-info/METADATA,sha256=EwH2JsfnfzkG-OSyvu-8lGjQ0aU6lsTuHMUwC_RDDTU,36893
464
+ evalscope-0.17.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
465
+ evalscope-0.17.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
466
+ evalscope-0.17.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
467
+ evalscope-0.17.0.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -8,8 +8,8 @@ import unittest
8
8
  from evalscope.config import TaskConfig
9
9
  from evalscope.constants import EvalType, JudgeStrategy, ModelTask, OutputType
10
10
  from evalscope.run import run_task
11
- from evalscope.utils import test_level_list
12
11
  from evalscope.utils.logger import get_logger
12
+ from tests.utils import test_level_list
13
13
 
14
14
  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
15
15
 
tests/cli/test_all.py CHANGED
@@ -9,8 +9,8 @@ import unittest
9
9
  from evalscope.config import TaskConfig
10
10
  from evalscope.constants import EvalType, JudgeStrategy, OutputType
11
11
  from evalscope.run import run_task
12
- from evalscope.utils import test_level_list
13
12
  from evalscope.utils.logger import get_logger
13
+ from tests.utils import test_level_list
14
14
 
15
15
  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
16
16
 
@@ -127,7 +127,7 @@ dataset_args={
127
127
  'mmlu_redux':{
128
128
  'subset_list': ['abstract_algebra']
129
129
  },
130
- 'frames':{
130
+ 'docmath':{
131
131
  'subset_list': ['simpshort_testmini']
132
132
  },
133
133
  'bfcl_v3':{
@@ -165,3 +165,51 @@ class TestRun(unittest.TestCase):
165
165
  )
166
166
 
167
167
  run_task(task_cfg=task_cfg)
168
+
169
+
170
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
171
+ def test_ci_lite(self):
172
+ from evalscope.config import TaskConfig
173
+
174
+ task_cfg = TaskConfig(
175
+ model='qwen-plus',
176
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
177
+ api_key= env.get('DASHSCOPE_API_KEY'),
178
+ eval_type=EvalType.SERVICE,
179
+ datasets=[
180
+ 'general_mcq',
181
+ 'general_qa',
182
+ 'iquiz',
183
+ ],
184
+ dataset_args={
185
+ 'general_mcq': {
186
+ 'local_path': 'custom_eval/text/mcq',
187
+ 'subset_list': [
188
+ 'example'
189
+ ],
190
+ },
191
+ 'general_qa': {
192
+ 'local_path': 'custom_eval/text/qa',
193
+ 'subset_list': [
194
+ 'example'
195
+ ]
196
+ }
197
+ },
198
+ eval_batch_size=1,
199
+ limit=1,
200
+ stream=True,
201
+ generation_config={
202
+ 'temperature': 0,
203
+ 'n': 1,
204
+ 'max_tokens': 4096,
205
+ },
206
+ judge_worker_num=1,
207
+ judge_strategy=JudgeStrategy.AUTO,
208
+ judge_model_args={
209
+ 'model_id': 'qwen2.5-72b-instruct',
210
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
211
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
212
+ }
213
+ )
214
+
215
+ run_task(task_cfg=task_cfg)
@@ -5,7 +5,7 @@ import unittest
5
5
  from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
6
6
  from evalscope.constants import EvalType, JudgeStrategy
7
7
  from evalscope.utils.io_utils import dump_jsonl_data
8
- from evalscope.utils.utils import test_level_list
8
+ from tests.utils import test_level_list
9
9
 
10
10
 
11
11
  class TestCollection(unittest.TestCase):