evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (157) hide show
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,34 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
2
  evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
3
- evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
4
- evalscope/constants.py,sha256=1CYghe0fGccyiVgzMIHd2HIb6lOo9fmB-8pH_l99iI4,4014
5
- evalscope/run.py,sha256=ss7ECL4dq18ur9qFOWqCNIsckXQWWl1EsVaJxDPBVq8,7000
6
- evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
- evalscope/summarizer.py,sha256=nZOaXfaSaXht8GAVik_Pvz2YL0Gv24UG45mMklyBkvA,5938
8
- evalscope/version.py,sha256=zdIprYl0JT22Bpk3tCmkfp1QLsi0mvROpxeTcgkCNHI,119
3
+ evalscope/config.py,sha256=1YfHXlIyYH70FQfi8TiUtpUH3VIRCh5YcbaayKZo5s4,6781
4
+ evalscope/constants.py,sha256=Tc74W89SxeeEzISDzO5IoxSo9A_F0LqjH0mOrcAYJXc,3737
5
+ evalscope/run.py,sha256=dL1deJ0J1RHW6X6ZStXzAVL7NwbjW6McfdOMkCpWrtc,7012
6
+ evalscope/summarizer.py,sha256=ZLFDHmi0Bgo18ouQsxuUl9vmIES9zkoapLLWRLhy19Q,5911
7
+ evalscope/version.py,sha256=wsTu-_Fq9Dmfg7bXg6eDVtNwZA5ui-MZ6IPs4EhytAc,119
9
8
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
10
- evalscope/app/app.py,sha256=QyO0RFfkLeOVzx-Mr8br3bYPwii2O_eVGmNgwCGHkac,29863
9
+ evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
11
10
  evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
12
- evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
11
+ evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
12
+ evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
13
+ evalscope/app/ui/app_ui.py,sha256=FvpHsr4Lc0LAcwXIaVn9sUAAjO8QLNYCuojmKrjKvaE,2023
14
+ evalscope/app/ui/multi_model.py,sha256=7pe71PSaU7gnvogmCBMLUr_DUAgAeni12k4QcoHkFDs,15040
15
+ evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
16
+ evalscope/app/ui/single_model.py,sha256=K5SU_S7WXWsbYLih2rQfRrVE50enzbCrq4rbhpo2uXo,9406
17
+ evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
18
+ evalscope/app/utils/data_utils.py,sha256=TMgiDu4MGvWgyd8G_nNOAOw39ZCRVFfRLLvrxCX_Ocw,6806
19
+ evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
20
+ evalscope/app/utils/text_utils.py,sha256=7DJow2W3Fna5Qny-AbwVRTWDh44ualONS5A5uUtesuk,3590
21
+ evalscope/app/utils/visualization.py,sha256=N9M7OV6lxcCvFtXmLBcUWw3RPlYZva7YH3rvhgTElqk,3522
13
22
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
23
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
15
24
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
16
25
  evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
17
- evalscope/backend/opencompass/backend_manager.py,sha256=kIPzirjAOW0_YNQiCrhjRfAVD3UpcGmr4RXBH-WMH0Y,10409
26
+ evalscope/backend/opencompass/backend_manager.py,sha256=POEYRmNlptoRYlTNcpRcHEXwqrYo34RW4TM_kf7wMQQ,10458
18
27
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
19
28
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
20
29
  evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
21
30
  evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
22
- evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
31
+ evalscope/backend/rag_eval/backend_manager.py,sha256=iEer5IhEJ8nOXW_s3j6l5jvfLgBftcGQMAtJk69Wzdc,3521
23
32
  evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
24
33
  evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
25
34
  evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=2OdPj4gSUWdAGCfS9PHpPGbd6q5RqEyli2G6UGb1ffw,8888
@@ -53,15 +62,16 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
53
62
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
54
63
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
64
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
56
- evalscope/backend/rag_eval/utils/embedding.py,sha256=64DQrGzB2sw_Y0twwlSmOYobpOfgmRBFLfVMOc39UTk,9370
65
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=uqodHHvOKlza-bCLJ9Zkm8G1Jt2y2JT88jtIqCjA0sA,9379
57
66
  evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
58
67
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
59
68
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
60
- evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
61
- evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
69
+ evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
70
+ evalscope/benchmarks/__init__.py,sha256=NVd_VvmkY36LxdHNmgeogSBwMFfWoLJAZF8vDg-CoFc,1308
62
71
  evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
63
- evalscope/benchmarks/data_adapter.py,sha256=NgaKHfm288hVGeG1l_xGbLvB-Gno4M7Xd5Pa2ozY17Q,22975
64
- evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
72
+ evalscope/benchmarks/data_adapter.py,sha256=UI4HpnJNYo18GXRiU0HwNUxjRfoSXlCB-xEBIGs2ckg,23914
73
+ evalscope/benchmarks/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
74
+ evalscope/benchmarks/utils.py,sha256=mIk8n6zVMICQ5JWMyEwUqwlkxva4L-oD5SZzpIKw1sI,1851
65
75
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
76
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
77
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
@@ -71,16 +81,16 @@ evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z
71
81
  evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
72
82
  evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
73
83
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- evalscope/benchmarks/aime/aime24_adapter.py,sha256=hVoQMXpp_DSoZuJzCQLbAAUR8p4h9_1WcFUxelGUJBA,2036
75
- evalscope/benchmarks/aime/aime25_adapter.py,sha256=TJ2pivciL8LhffGP6lZPMBqaaTzuaCN_00Bz51E7QFI,2037
84
+ evalscope/benchmarks/aime/aime24_adapter.py,sha256=iwOvjB-hwUYFRNDTe8xuRCFxASh69gCzuU3Vz9qnsUs,2070
85
+ evalscope/benchmarks/aime/aime25_adapter.py,sha256=fNJXUSCxjGyvtX_gkp4bveC_oXHwr1VNQdUePAuwjIE,2071
76
86
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=AwrtuC_6o2Wa1zGnZ080OCuWv8S-hwvGHJqZ7KPQwoI,4328
87
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=BLU3G7IB3gmIYiXtznzHjPIrvi65nYZwqSF7FFnP7Aw,4324
78
88
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
79
89
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
80
- evalscope/benchmarks/arc/arc_adapter.py,sha256=BG_VeTyN88oXu7qquhva2ou1I3-RePzXLxQCsY_ne2M,6682
90
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=OO2khZxfgsRzYk64zLvq4yEbgPdQuvbIVPO4t0E4Hcc,6703
81
91
  evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=FBwkxfnbyXgTiFmwKA5mjIOb_eOuUnXrijM4rrBHZE4,6672
83
- evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
92
+ evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=iJeIh-xiQbuc3E8ea48DTCfDW_KnlGMdTeIek5AlKnk,6668
93
+ evalscope/benchmarks/arena_hard/utils.py,sha256=kRgKXdVt4Ep3XGOzUQpf9JThnp1OOt8oUQhvQEtOzRY,4596
84
94
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
85
95
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=IFu9XctrLNJcIFXK4jV3LmyqQCVb66z8YhL07Osc1TA,8623
86
96
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -111,19 +121,19 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
111
121
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
112
122
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
113
123
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=MQPlfMvTQYHA4EP5g7eNzXDs4A4QvgYOiGC458Z39q4,10080
124
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ThDOYrJY_RdXMLSC1S9lP-8zYd1syZWpcrXXV1ZPLVs,10100
115
125
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=jZNOtaTwiyXAA6wQ8udXKyOo-f2mKOPjE6q7mrKCPXQ,11639
126
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=V_TC_E0lKXaFcV_qIdrg2_iddmGJ4um8iIdaXVaK_EM,11146
117
127
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
118
128
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
129
  evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=kaZ8fZK2a9oVwpGRUA3wz3FkxtcTY_FkRDYrdLjDNro,8433
120
130
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
121
131
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
122
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=QdeXKS8TdEua8lWWjoNOLvSB2fN3AKa7pKV0xjwmwME,10596
132
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=3oh79iFR006vnlpwjsRVO5cl6pOav00I5uU98DPCORM,10119
123
133
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
124
134
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
125
135
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
126
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=yubOKBm8IqskyuEYkbUDxdkUCmVJE1-yB5SxxMWyHjA,7004
136
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ZBIZJZDSy-b9lTgm2-ZU2pEh053rveMwccI1fu6xpkc,7038
127
137
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
138
  evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=z_wbrA4yJoMwfg4TJkvEZB2aV5cPFcxCZ3JIj49F4Do,2604
129
139
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -135,10 +145,13 @@ evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46
135
145
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
146
  evalscope/benchmarks/frames/frames_adapter.py,sha256=xYvxGzqj_YPDSZYogP9TxUhOxvZFbud1S2SOvz1nlDU,3136
137
147
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
148
+ evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
+ evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=j2aDzikz9obxvrR-damdvSCXR0rfjEo-OzX8vujj2N0,19887
150
+ evalscope/benchmarks/general_arena/utils.py,sha256=u0q4FNIOFka1_gC344OCvBXUz89Ah6M8asjIXbNSweM,7188
138
151
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=r2qLKe8esRe45t2CoYzDiZXlq0zO6jVR-iiqLvdmn7Y,5160
152
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=cPN-p0tndjocQYqfc6OFkT5k8KL7kkVklmOtps-F08Y,5391
140
153
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
141
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=NFeV3rPSfv7_imlEnCI3oSi7aSJGGX2JDqzgvyLVOFw,4861
154
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=dpIGe635CoW4ejVohVwcarBxSckqvlnxcJ2ElpRlQ9o,5669
142
155
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
156
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
144
157
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
@@ -147,40 +160,42 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
147
160
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYfckv8x15_OXIwTI,10752
148
161
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
149
162
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
150
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=l4bHGYaU66ga9J09_QTrrqM9zrzA7mpwQ9Ul7Uy47ig,6176
163
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=kgHz-n8_93J8DdR7XBlzfM2KDRoKcvg80h6CCjWv_Xk,6191
164
+ evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
+ evalscope/benchmarks/hle/hle_adapter.py,sha256=ts38e-AqtUcbfc6VqRtWLacZDh7KzSm4rj7xKm9vTFc,4445
151
166
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
152
167
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
153
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=O6muXpiBrQ9RGSglnl3gS0yO6BSkQtXASMR9yXUfhEE,5515
168
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=ZqNG3L8yMY44B7HleUjlSbVG-GLk9RBsvaGWOm2fQVw,4788
154
169
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=C7Zww11RGbPzlB7dy-mef-2uHOVXFTdLc5W48_PM5xM,2172
170
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=2oStqiTD4w2f2n0kbjcbg7GJQfKCsHFieokQcNndWb4,2041
156
171
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
157
172
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
158
173
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
159
174
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
160
175
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=ZwUWpVe5gkEC3l5wTo-XdePHiDjQbHDhX2W0WTS5mC4,2715
176
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=kYXKiiFa_F5Gl3mIOAtKxXW5myi0VW_XGidbSjArd6M,2730
162
177
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
163
178
  evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=iqmVUMZmyRhzOOXXQ-NN9P1nGvvbzTjOSEp6djbN_rw,6503
164
179
  evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
165
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=IHqEjfK_2O5Tk1kvWJCOcnEGIVW8Ujes6aLVm5YnkEg,3789
180
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=a4Vz73V1q8A0tV_DeKnTJKVxnDWmXs84diaqSym8gLM,3550
166
181
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
167
182
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
168
183
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
169
184
  evalscope/benchmarks/live_code_bench/testing_util.py,sha256=abjlwp6HDayf88mMI_daOKm06nEOeNBaMkmGWqk2DJo,17286
170
185
  evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=WXpieeLsr_BRd48fSHswdKvO2uUGYNDNfB4FyReDW9o,3134
186
+ evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=aibJmtIJkpvWlyLBiiL7TCdjUGfW8pxkAU2KQEZDIPM,3149
172
187
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=qrfqXrSSBJ0JzkhMg_6_gZtK6eWyMtgr_WiFqtssQ9c,2290
188
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=Oc9XnBgMAjEerYAk3GtY2TTKm1QH_UI896kUuW2_a5Y,2324
174
189
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
175
190
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
176
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=NlodlICpGVz9_MjRn-FfCMGIfmEPBBXgMtczcxuvRlc,12090
191
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=Rhi-J6oGWawRVBk38ZgXk8-XrZ7wL8sf4zrncU73jgs,12111
177
192
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
178
193
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=zAW3lvWXkGqYsPbVfMj5tc5EuDXLCGLFNPT8sLcKuO0,4539
194
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=uglOOZBZfQBIuJOG7iT4THk2LNcfHQoakxQDpS4jB1U,4554
180
195
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=8FRC1lQX-Pv5Tji1Lsp5Mr456JvtGT1lU9c3hVO25l4,9871
196
+ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=fYtAVKKGGfzRnDlEzU7IULruj2vYzey9aWoyZBBeftc,9886
182
197
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
- evalscope/benchmarks/musr/musr_adapter.py,sha256=lh0UrE3yqWzmOw_ALkxJJ9AbBn11HlQMYHO39P1HAnE,2676
198
+ evalscope/benchmarks/musr/musr_adapter.py,sha256=YTRFGsVuogdYlZoylfD3ij4AbyYrvT4hpY7MueVfu6c,2691
184
199
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
185
200
  evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=AybH_Ka2B2WCh-EvwAsMPlCGzJ78dHBhe5sJ6nDgNK4,15691
186
201
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
@@ -189,7 +204,7 @@ evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcw
189
204
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ULuXG68ifTEc_ucH_cj0p5AGdbL-ahA7kcJ-AzYVmSM,3767
190
205
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
191
206
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
192
- evalscope/benchmarks/race/race_adapter.py,sha256=JjIGGthWbktrsBL68rE-hvVY9ZOwKrrZzJoIdBdNoWg,6614
207
+ evalscope/benchmarks/race/race_adapter.py,sha256=FW_FSUGq5Iyz2cTACdk3qOqDt2kXwtCpVB9FT_Bc6LM,6635
193
208
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
194
209
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
195
210
  evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=gQzrmslukHOJf-VBSnVKYddIg34EEOvQuGYTurQgBy0,9289
@@ -198,18 +213,20 @@ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10
198
213
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
199
214
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
200
215
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
216
+ evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
217
+ evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=5_VgRUtEjeZ-8gRZj4cnwwso1GUqf2GB49AlI4xqyDM,4221
201
218
  evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
202
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=_QNncuCCMhhjsWzB934sYF-k010fKUdhhAOWrJ9LKDA,2813
219
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=fy6Hb84cm6s-pOoQXmT-N8D1OUYVGCuq77-2xwM_WLA,3093
203
220
  evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
204
221
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
205
222
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
206
223
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
207
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=xvgt3SQQ0g5qT_RkZ1YOoYPxDS_CZrBJbDIKQjF-xEo,5328
224
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=IT5l6cFzZQi2i68kp8rWBdXWxiDVd14MORgk-lusPBM,5516
208
225
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
209
226
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
210
227
  evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=xY4Kr-GzyyE_TWGlaKL5mo9qTaza0frWLy7EgIwlZn4,12958
211
228
  evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
212
- evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=WSJv4TDLISUy66e_PZEfjrIwsQOhgPXqeyA30nBwetM,2194
229
+ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=GkmTsrlpU1IA-E7dJXmsHXyY9ivRbmbeVKxFmMwWtLc,2209
213
230
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
214
231
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
215
232
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -221,15 +238,13 @@ evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCe
221
238
  evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
222
239
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
223
240
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
224
- evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
225
- evalscope/evaluator/evaluator.py,sha256=pQ85iNgnA9ME2b7UNH33uybcStjSQffJTh55ZFqwCNk,22115
226
- evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
227
- evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
228
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
229
- evalscope/metrics/__init__.py,sha256=g96dZSt3Dh56TdVbe4yDqcfmr9DoLqH-R2__3Qvorjk,1497
230
- evalscope/metrics/llm_judge.py,sha256=O2IaJpsBe1HqfCVnRYOt_PLWg6w85DYlYLU7yTq5idw,4384
241
+ evalscope/evaluator/__init__.py,sha256=XqPnEp5MvfRwC5M5cEeOAC0-MMEPxBIESqiSa3YMBgo,84
242
+ evalscope/evaluator/evaluator.py,sha256=HKEF2k0S_dJR8cF9lrqf_W4diXbb6H3L81pD6XcmLiA,22481
243
+ evalscope/metrics/__init__.py,sha256=CH3bNyRx9dJ3gOqNwKDlaZ7zan4MShM0h8SnzarjokU,1851
244
+ evalscope/metrics/completion_parsers.py,sha256=56ZNzOfNU0O1ba9fs9Cyi4Vk_YUmcgWUbxW0SJ2KrlU,8974
245
+ evalscope/metrics/llm_judge.py,sha256=1hPFnGc3Szszqo21O618a7mxOgkdba3KsbZ66vvTbSA,8380
231
246
  evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
232
- evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
247
+ evalscope/metrics/metrics.py,sha256=OLfvEljGbQnv-bBiFD-GR2On4mpZ0xhKxiKkjZfoDX8,14268
233
248
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
234
249
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
235
250
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
@@ -335,75 +350,57 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.p
335
350
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
336
351
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
337
352
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
338
- evalscope/models/__init__.py,sha256=yB4NuKvSd3Jd4GRQvJeGPxwigd8RJErdop5PzSQhsMY,1565
353
+ evalscope/models/__init__.py,sha256=x0Sna8mbujdOVqIYSGwIULbiPOue_Ifp-2JElSZsuMs,1481
339
354
  evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
340
- evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,6308
341
- evalscope/models/register.py,sha256=WiylzfL-vb6Bl3H3_RdIaBabVOAc9tiuhsQzYJDVzTg,1948
342
- evalscope/models/adapters/__init__.py,sha256=zmldx8yC_KTI8NDRcxNLyPzv19wc57UvOVvzwyuYnG4,647
343
- evalscope/models/adapters/base_adapter.py,sha256=TfINK84g4mqmHcnqvvHmk-MXRN2Pkan4yVlVd4j0nVY,3166
344
- evalscope/models/adapters/bfcl_adapter.py,sha256=KtreuJ21X1lcUGGhVgW3U62p3P65_oydMdBPtE5um-I,10332
345
- evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
346
- evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
347
- evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
348
- evalscope/models/adapters/server_adapter.py,sha256=tS-SurglnYYuAyXikR-550pE48KUVGpNoeZ8G_y47yA,9602
349
- evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
355
+ evalscope/models/register.py,sha256=G35J6BULFWwuqZO_rTkKBru1llZAyfPztcAASp_cb8M,1257
356
+ evalscope/models/adapters/__init__.py,sha256=WRaZsHlnz0MvGg9Jq565-XJjED-4cAyu4KbmrOhrHO4,688
357
+ evalscope/models/adapters/base_adapter.py,sha256=P4aicNmz1nsX9QLY9t4c6OIQPzIYfOhcrqjlAjR-ENY,3477
358
+ evalscope/models/adapters/bfcl_adapter.py,sha256=cG0vsQ3H2pmabo6tC0Y5Gonw0ng5-RFljDyRBMSj6xE,10422
359
+ evalscope/models/adapters/chat_adapter.py,sha256=epxA_on9ipsak8Lnkweh9en2AjVm5G0L1ARXYmDEEbk,8026
360
+ evalscope/models/adapters/choice_adapter.py,sha256=wIXnDcgnKaIMdhToaqy6fidhuZDpEz2vhxIB_V9u3Z8,8203
361
+ evalscope/models/adapters/custom_adapter.py,sha256=W8DIBiMWvHHcc0Mn9Frjj1YbpHRi7w-UQVJDiU2PakU,2400
362
+ evalscope/models/adapters/server_adapter.py,sha256=W6SXrPy-hZXpnISDjupu_j7bnmt-cP55sDojPXThitc,9701
363
+ evalscope/models/adapters/t2i_adapter.py,sha256=d6OviQFi_uN8PPXKrFpivk5Awm1O6wd_Gii8t3hVahY,2806
364
+ evalscope/models/adapters/tau_bench_adapter.py,sha256=jYGaj2L2wxtEiTdiSwZdY1XNkSzm6os7IvkxgK4msR0,6889
350
365
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
351
366
  evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
352
- evalscope/models/custom/dummy_model.py,sha256=aZg_OZ6yFNg2macxS5iCymIdFHODdQGH4OOwMXQe4SM,3113
367
+ evalscope/models/custom/dummy_model.py,sha256=WpfrS3kvwRRdyThx9baaJ5vodYYh29VGRKsGKMWFflI,3124
353
368
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
354
- evalscope/perf/arguments.py,sha256=uBKqT_s5aG3a295MxE2VIzs9_8XXxhenN2TdZbsYXEA,10865
355
- evalscope/perf/benchmark.py,sha256=cjUpJ3SRnZVBs_H24yqLh4WG_hcCADrniLG1VsmByb8,7901
356
- evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
357
- evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
358
- evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
359
- evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
360
- evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
361
- evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
362
- evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
363
- evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
364
- evalscope/perf/plugin/api/openai_api.py,sha256=PmjBfIzzSuzcKiVOUeA2aPxihV0dZEzFlgmbrD2isME,7773
365
- evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
366
- evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
367
- evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
368
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=MbJKEB0XqZE0nDEenwYs0FLH9QL658Vn9uQmUH4hPvk,1605
369
- evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
370
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
371
- evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
372
- evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
369
+ evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
370
+ evalscope/perf/benchmark.py,sha256=ZVmsSeKDUKkApt3y5tIMMFZAyAj3UNVT7JPp1fh5mhE,7880
371
+ evalscope/perf/http_client.py,sha256=l_OKL80kTP6sM_PEBvsJ1_TejYJdUQnE2UlB-ud1WQM,4588
372
+ evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
373
+ evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
374
+ evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
375
+ evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
376
+ evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
377
+ evalscope/perf/plugin/api/custom_api.py,sha256=f8rUixcV9mTxoYyabu3wedEC4YVB70Yw6Az1NpfeWPQ,10375
378
+ evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
379
+ evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
380
+ evalscope/perf/plugin/api/openai_api.py,sha256=Mt_VedJUaCH3g-oVSJ_fsGcPk0KkspSzIMkrkih2Zb0,7777
381
+ evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
382
+ evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
383
+ evalscope/perf/plugin/datasets/custom.py,sha256=UuOk8xYfSYyyYZL3U4grUjtfQhWHHZeAEC63n_4Siuw,1376
384
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=IXz5uu5SlqF1l_tJ_ITr2vx_R_d7gxWzqPuyEOx7rYo,1043
385
+ evalscope/perf/plugin/datasets/kontext_bench.py,sha256=XjKzr7nMzI3cfk83IH0PH1TNJaQMRXUpACnzFfP2n6g,1091
386
+ evalscope/perf/plugin/datasets/line_by_line.py,sha256=c3ydW4GqxkG0vl2g64jG0vBMql2FuFPyWh3mgkIh9Do,987
387
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=VnMjdHl_JV3NmZ6wRxVlJ99e8PYSjQTcVxoTkl21Ei0,1327
388
+ evalscope/perf/plugin/datasets/openqa.py,sha256=33AR419IrH-FxZRjjcYdAIEZXaX4TKEoirVVfX--N9I,1493
389
+ evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
390
+ evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
373
391
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
374
392
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
375
- evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
376
- evalscope/perf/utils/benchmark_util.py,sha256=EPKUDijue85b8KhSJoJKLh6comkTKRjq2yoEw4kxBho,7227
377
- evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
393
+ evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
394
+ evalscope/perf/utils/benchmark_util.py,sha256=7bHpa5oaqcPJX7DSUkzK9assoFSHC27Q7-QylUOiklQ,7136
395
+ evalscope/perf/utils/db_util.py,sha256=TCdmoEx5iScL6h8wzucPojPwn6J1wTmQqX4sVk-ilHo,11630
378
396
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
379
397
  evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
380
398
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
381
399
  evalscope/perf/utils/rich_display.py,sha256=xZzeryQbYM6Cv8g1ulK6OQUE2CalQ_KtFxiy7pioeEU,8127
382
- evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
383
- evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
384
- evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
385
- evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
386
- evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
387
- evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
388
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
389
- evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
390
- evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
391
- evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
392
- evalscope/registry/tasks/arc.yaml,sha256=MghUuCmZPEwGqwYhA8ClRWHiSwC3kbHcKMRicQl9aqc,765
393
- evalscope/registry/tasks/bbh.yaml,sha256=GE3PpE8zw_SROj41LZ5bTm6ZXXZjYOorAdwBCTEePXM,604
394
- evalscope/registry/tasks/bbh_mini.yaml,sha256=8o9ZiWaCTkN2uTwiOhjBQuyKm7GUw6ZfUZxb2bkOmvs,678
395
- evalscope/registry/tasks/ceval.yaml,sha256=XDaszb7DROKk8nQDiklirTvDJwkOUJtIN_tcUFVvIJk,703
396
- evalscope/registry/tasks/ceval_mini.yaml,sha256=4aYW4c0IzgAXSs5dp4d8dJ0OHVp5sD4uiRjChjL1zZg,672
397
- evalscope/registry/tasks/cmmlu.yaml,sha256=yOgKl1jmfcAfTuUcIMmG5SQhkrbEHEyyP3YuCuIN3l0,703
398
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=egdiM5oG7RSs0M-g8QNikwhJ9tZVgw5FiLy-rIYYHAA,737
399
- evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNiAhGEdUqL-8c,702
400
- evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
401
- evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
402
- evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
403
- evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
400
+ evalscope/report/__init__.py,sha256=DIoXbj0mjs1m2kEgFvIyqy4skDuoBu0UDVmTDa60Ymk,905
404
401
  evalscope/report/combinator.py,sha256=4ahUtTFPTNiSjamldX3IcLf33yKTJKs6ZsC4fsCafe8,4192
405
402
  evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
406
- evalscope/report/utils.py,sha256=A8_bo-97UKA7Ys5slZ4TydCno9p7-Y3rxLpOd8gmAjM,7685
403
+ evalscope/report/utils.py,sha256=taTSLvMKzAtJ9oha7pe0WF2UZZfEqPQgdj4urq7ZJIE,8298
407
404
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
408
405
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
409
406
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -439,40 +436,39 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
439
436
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
440
437
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
441
438
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
442
- evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
443
- evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
439
+ evalscope/utils/__init__.py,sha256=OiVmYHVkC_d8s6Zp1I6p6oTyhCEGvN-I9E6uzn8dgF4,1940
440
+ evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
444
441
  evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
445
- evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
446
442
  evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
447
- evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
448
- evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
449
- evalscope/utils/io_utils.py,sha256=atRCynX9dFcZGxCDip8HRpdzVkkTXCK6y4HzfiOEFU8,5615
443
+ evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
444
+ evalscope/utils/io_utils.py,sha256=2eEkLx4jhekgIV4vYL8yTN0PT6dbHUERMBZwmvxuiEc,7109
450
445
  evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
451
- evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
452
- evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
446
+ evalscope/utils/model_utils.py,sha256=F1_WBHvBehWqrTd6kPtKICeeYucaZn5H0Gc3cCplYB8,2329
453
447
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
454
448
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
449
+ tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
455
450
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
456
- tests/aigc/test_t2i.py,sha256=Dqug3rV7EIkj6uwBjgj5UMj8ZrpGSznSHfn2g8J_P3M,3860
451
+ tests/aigc/test_t2i.py,sha256=XtVknpwlVMb6FSw3_WMFxMq0gZX6iG-ffdSQkcW2Fzw,3856
457
452
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
458
- tests/cli/test_all.py,sha256=yo1ysDM90dI_kWxKKPOf-BsYneeRYRJa5uh6_7SDZ3Y,4332
459
- tests/cli/test_collection.py,sha256=jIGQNQO4msJE9w4Ms5qxtuhkHVukeLcHvBF2dzHCKCI,4207
460
- tests/cli/test_run.py,sha256=98OPX7h2_fDmGtQHJWzGA2pjd_vhgKkTIEYgeLMkRo4,18640
453
+ tests/cli/test_all.py,sha256=IT0mxjiuHCC0PpT4z3oN1Bbr_0viMcm8GnShZ02kp8w,6333
454
+ tests/cli/test_collection.py,sha256=bXWzccH822Y2B1Ed251U6TE8G_osI6MXYNxzmfv9kBI,4197
455
+ tests/cli/test_custom.py,sha256=0YE-TCAeaQMRVRFla_TIvTd8d0USvvsSeqvYAD3NDNg,8796
456
+ tests/cli/test_run.py,sha256=YKX2XCHPxnStgzzP67U90RV9r1MC2GM3JoGQqfZKqrI,17324
461
457
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
462
- tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
458
+ tests/perf/test_perf.py,sha256=AEWvpN3ID6s-9MEoaZjQqUM8VVsqgk_v9KX8pDgvozA,5864
463
459
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
464
- tests/rag/test_clip_benchmark.py,sha256=uykLrRCfNR8aOiLJI0GdSL4mOys3q0LFHsA_Ur7xudc,2658
465
- tests/rag/test_mteb.py,sha256=38cDYpqf0ozvrWf36I7z_O_DmAUCbF9LX06us65xNXk,7209
466
- tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
460
+ tests/rag/test_clip_benchmark.py,sha256=13pcY3gYHNQh2KfEHCqtCSqiOcbngSJ1BlVZzI58JCE,2694
461
+ tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
462
+ tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
467
463
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
468
- tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
469
- tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
470
- tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
464
+ tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
465
+ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
466
+ tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
471
467
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
472
- tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
473
- evalscope-0.16.3.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
474
- evalscope-0.16.3.dist-info/METADATA,sha256=6PnfUd1rJfGeAJ-7KowXFFbwP8xVw7r9-3u6HRlJHic,36533
475
- evalscope-0.16.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
476
- evalscope-0.16.3.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
477
- evalscope-0.16.3.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
478
- evalscope-0.16.3.dist-info/RECORD,,
468
+ tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
469
+ evalscope-0.17.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
470
+ evalscope-0.17.1.dist-info/METADATA,sha256=1PRiimjOBZgSWjvT3iL4VcvdaWk8v3fGp9xCXLpM1Dw,38469
471
+ evalscope-0.17.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
472
+ evalscope-0.17.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
473
+ evalscope-0.17.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
474
+ evalscope-0.17.1.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -8,8 +8,8 @@ import unittest
8
8
  from evalscope.config import TaskConfig
9
9
  from evalscope.constants import EvalType, JudgeStrategy, ModelTask, OutputType
10
10
  from evalscope.run import run_task
11
- from evalscope.utils import test_level_list
12
11
  from evalscope.utils.logger import get_logger
12
+ from tests.utils import test_level_list
13
13
 
14
14
  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
15
15
 
tests/cli/test_all.py CHANGED
@@ -9,8 +9,8 @@ import unittest
9
9
  from evalscope.config import TaskConfig
10
10
  from evalscope.constants import EvalType, JudgeStrategy, OutputType
11
11
  from evalscope.run import run_task
12
- from evalscope.utils import test_level_list
13
12
  from evalscope.utils.logger import get_logger
13
+ from tests.utils import test_level_list
14
14
 
15
15
  os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
16
16
 
@@ -39,7 +39,7 @@ datasets=[
39
39
  'general_mcq',
40
40
  'general_qa',
41
41
  'super_gpqa',
42
- 'live_code_bench',
42
+ # 'live_code_bench',
43
43
  'mmlu_redux',
44
44
  'simple_qa',
45
45
  'chinese_simpleqa',
@@ -53,8 +53,13 @@ datasets=[
53
53
  'docmath',
54
54
  'needle_haystack',
55
55
  'bfcl_v3',
56
+ 'hle',
57
+ 'tau_bench',
56
58
  ]
57
59
 
60
+ # Reverse the datasets list to ensure the order is from most recent to oldest
61
+ datasets.reverse()
62
+
58
63
  dataset_args={
59
64
  'mmlu': {
60
65
  'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
@@ -127,12 +132,23 @@ dataset_args={
127
132
  'mmlu_redux':{
128
133
  'subset_list': ['abstract_algebra']
129
134
  },
130
- 'frames':{
135
+ 'docmath':{
131
136
  'subset_list': ['simpshort_testmini']
132
137
  },
133
138
  'bfcl_v3':{
134
139
  'subset_list': ['simple', 'multiple']
135
- }
140
+ },
141
+ 'hle': {
142
+ 'subset_list': ['Math', 'Other'],
143
+ },
144
+ 'tau_bench': {
145
+ 'extra_params': {
146
+ 'user_model': 'qwen-plus',
147
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
148
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
149
+ },
150
+ 'subset_list': ['airline'],
151
+ },
136
152
  }
137
153
 
138
154
  class TestRun(unittest.TestCase):
@@ -165,3 +181,51 @@ class TestRun(unittest.TestCase):
165
181
  )
166
182
 
167
183
  run_task(task_cfg=task_cfg)
184
+
185
+
186
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
187
+ def test_ci_lite(self):
188
+ from evalscope.config import TaskConfig
189
+
190
+ task_cfg = TaskConfig(
191
+ model='qwen-plus',
192
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
193
+ api_key= env.get('DASHSCOPE_API_KEY'),
194
+ eval_type=EvalType.SERVICE,
195
+ datasets=[
196
+ 'general_mcq',
197
+ 'general_qa',
198
+ 'iquiz',
199
+ ],
200
+ dataset_args={
201
+ 'general_mcq': {
202
+ 'local_path': 'custom_eval/text/mcq',
203
+ 'subset_list': [
204
+ 'example'
205
+ ],
206
+ },
207
+ 'general_qa': {
208
+ 'local_path': 'custom_eval/text/qa',
209
+ 'subset_list': [
210
+ 'example'
211
+ ]
212
+ }
213
+ },
214
+ eval_batch_size=1,
215
+ limit=1,
216
+ stream=True,
217
+ generation_config={
218
+ 'temperature': 0,
219
+ 'n': 1,
220
+ 'max_tokens': 4096,
221
+ },
222
+ judge_worker_num=1,
223
+ judge_strategy=JudgeStrategy.AUTO,
224
+ judge_model_args={
225
+ 'model_id': 'qwen2.5-72b-instruct',
226
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
227
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
228
+ }
229
+ )
230
+
231
+ run_task(task_cfg=task_cfg)
@@ -5,7 +5,7 @@ import unittest
5
5
  from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
6
6
  from evalscope.constants import EvalType, JudgeStrategy
7
7
  from evalscope.utils.io_utils import dump_jsonl_data
8
- from evalscope.utils.utils import test_level_list
8
+ from tests.utils import test_level_list
9
9
 
10
10
 
11
11
  class TestCollection(unittest.TestCase):