evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (89)
  1. evalscope/arguments.py +3 -1
  2. evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
  6. evalscope/benchmarks/benchmark.py +12 -10
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
  10. evalscope/benchmarks/data_adapter.py +82 -19
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
  23. evalscope/benchmarks/musr/__init__.py +0 -0
  24. evalscope/benchmarks/musr/musr_adapter.py +71 -0
  25. evalscope/benchmarks/process_bench/__init__.py +0 -0
  26. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  27. evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
  28. evalscope/benchmarks/race/race_adapter.py +12 -16
  29. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  30. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  31. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  32. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  33. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  34. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  35. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  36. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  37. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
  38. evalscope/benchmarks/utils.py +43 -0
  39. evalscope/cli/start_app.py +4 -1
  40. evalscope/cli/start_eval.py +4 -3
  41. evalscope/cli/start_perf.py +4 -2
  42. evalscope/collections/evaluator.py +16 -1
  43. evalscope/config.py +13 -3
  44. evalscope/constants.py +7 -0
  45. evalscope/evaluator/evaluator.py +3 -1
  46. evalscope/metrics/__init__.py +2 -1
  47. evalscope/metrics/metrics.py +23 -2
  48. evalscope/metrics/named_metrics.py +1 -0
  49. evalscope/models/__init__.py +2 -1
  50. evalscope/models/base_adapter.py +32 -6
  51. evalscope/models/chat_adapter.py +4 -1
  52. evalscope/models/choice_adapter.py +4 -0
  53. evalscope/models/custom_adapter.py +2 -0
  54. evalscope/models/local_model.py +3 -2
  55. evalscope/models/register.py +28 -0
  56. evalscope/models/server_adapter.py +107 -29
  57. evalscope/perf/__init__.py +0 -1
  58. evalscope/perf/arguments.py +18 -8
  59. evalscope/perf/http_client.py +8 -6
  60. evalscope/perf/plugin/api/openai_api.py +11 -1
  61. evalscope/perf/utils/analysis_result.py +1 -1
  62. evalscope/perf/utils/benchmark_util.py +6 -2
  63. evalscope/report/app.py +15 -8
  64. evalscope/report/combinator.py +2 -2
  65. evalscope/run.py +6 -5
  66. evalscope/third_party/thinkbench/__init__.py +3 -0
  67. evalscope/third_party/thinkbench/eval.py +429 -0
  68. evalscope/third_party/thinkbench/infer.py +130 -0
  69. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  70. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  71. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  73. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  74. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  75. evalscope/utils/chat_service.py +1 -0
  76. evalscope/utils/filters.py +59 -0
  77. evalscope/utils/logger.py +3 -3
  78. evalscope/utils/model_utils.py +17 -1
  79. evalscope/utils/utils.py +45 -45
  80. evalscope/version.py +2 -2
  81. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
  82. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
  83. tests/cli/test_collection.py +1 -1
  84. tests/cli/test_run.py +151 -32
  85. evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  86. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  87. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  88. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  89. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
3
- evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
4
- evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
5
- evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
2
+ evalscope/arguments.py,sha256=QT3f_oBDl1jXl68rgHVBsOxWeJTw1zXFmm7Zu1VRMQU,4826
3
+ evalscope/config.py,sha256=eQ_r94W_uQiF9ZWN-k84KxrT85E3YiJklDuM5mIKt_s,9124
4
+ evalscope/constants.py,sha256=l6xkVknVybi3frXaftksRZNaCFcw9ZJZ8ORJeWDJEaQ,3615
5
+ evalscope/run.py,sha256=ae6WsKllRt5xanRRFJWSBkVEjCf-Lgx35nlLyqOxctU,5785
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
- evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
8
+ evalscope/version.py,sha256=KVyRitFqvCQM-1iaU2VOfx7rh9IDqOUGstYhQ6DLAI4,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,15 +56,17 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
60
- evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
61
- evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
- evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
59
+ evalscope/benchmarks/benchmark.py,sha256=AByXFsuia3lqCLFsPRt95UR7SxwEuAGpeuKBVjb7jLE,2463
60
+ evalscope/benchmarks/data_adapter.py,sha256=JwptQHL4DbcZ_Ll0kJ0QL8rgK2ZVFftyAXiUWKcrvL4,15532
61
+ evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
62
+ evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
+ evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
64
+ evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
63
65
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
64
66
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
65
- evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
67
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=8ksPc6IM266NE7F9Bo-Y9SRZZM-tlCKPfLbJg3VEq9w,6269
66
68
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
67
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
69
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
68
70
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
69
71
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
70
72
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -93,101 +95,114 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
93
95
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
94
96
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
95
97
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
96
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=Qz2oNGw0H_4FtfY-Izdxv9fgwxScJksyvwzeQw-aVyo,11374
98
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=B3nO0WmqSyH-LlicqreIPWrxXgVPt1rrp3ndc7YRYiE,11157
97
99
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
98
100
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
99
101
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
100
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm0m6_7rJywqk3Rk,10446
102
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=zNaYSelcGZulgFLQXp2eD56_QOFRkaXHknfy_VWJciA,10230
101
103
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
102
104
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
103
105
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
104
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
106
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
105
107
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
- evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
108
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
107
109
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
110
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
109
111
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
110
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
112
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=wnKUIVc1UvnjI5XGOHf5aCx0H0xTKoZZWAD-Q8AJNAE,4686
111
113
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
114
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
113
- evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
115
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
114
116
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
115
117
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
116
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
118
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
117
119
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
118
120
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
119
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
121
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=2CnrIapK51l4bQyFKWWqmOaeBSpkIlq2asetWcp24gs,6057
120
122
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
121
123
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
122
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
124
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=YK4u3JG_Ub4vP-xnsrf-lMheIBdCgFWmirhPUch3biU,5120
123
125
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
126
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
125
127
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
126
128
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
127
129
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
128
130
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
129
131
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
130
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
132
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
131
133
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
134
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
133
135
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
134
136
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
135
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
137
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=RMZoHAApVOpD3_NeHLcsiM7SpglKpfrGSUhBWPgdAVE,11525
136
138
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
137
139
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
140
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
141
+ evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
142
+ evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
143
+ evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
144
+ evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
145
+ evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
139
146
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
140
147
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
141
- evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
148
+ evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
142
149
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
150
+ evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
151
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=FZwXN78X2fV3Dchop_UuFAhNFkwWs12qJlIczgvvrJ8,477
152
+ evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
+ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
154
+ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
155
+ evalscope/benchmarks/super_gpqa/utils.py,sha256=uhANVnoIaH8-QuzjcVuyVB-8aGOMy94XKUF-TFemY_Q,3578
156
+ evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
143
157
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
144
158
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
145
159
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
146
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
160
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb5BSyjO2eD4On6gX8xqlkV8,4961
147
161
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
148
162
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
149
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
163
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
150
164
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
151
165
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
152
166
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
153
- evalscope/cli/start_app.py,sha256=_NTmCd15tZOROAnPacGWirMS4OXHrL3n2eZj1kokpks,758
154
- evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
155
- evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
167
+ evalscope/cli/start_app.py,sha256=WTbba_Iitz1jkQ5n6KHRH-i3U7qJIM7iCi4a9roWjaA,808
168
+ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
169
+ evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
156
170
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
157
171
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
158
- evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
172
+ evalscope/collections/evaluator.py,sha256=Zi3uRZhSRIimYye_apZWL6VOiHqaM5znbFA4TBvqSbg,12761
159
173
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
160
174
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
161
175
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
162
- evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
176
+ evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
163
177
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
164
178
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
165
179
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
166
- evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
180
+ evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
167
181
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
168
182
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
169
- evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
170
- evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
183
+ evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
184
+ evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
171
185
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
172
186
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
173
187
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
174
188
  evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
175
189
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
176
- evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
177
- evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
178
- evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
179
- evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
180
- evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
181
- evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
190
+ evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
191
+ evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
192
+ evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
193
+ evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
194
+ evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
195
+ evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
182
196
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
183
- evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
197
+ evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
198
+ evalscope/models/server_adapter.py,sha256=dS_o9_iC8QY73AehIekYwBQieFECZ97JRfbfleJ-Dtk,6845
184
199
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
185
200
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
186
201
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
187
- evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
188
- evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
202
+ evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
203
+ evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
189
204
  evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
190
- evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
205
+ evalscope/perf/http_client.py,sha256=eoRPaBTCVC4DpgH4tnc-31_h_2PVkWUwCLWK6_TTkhM,7282
191
206
  evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
192
207
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
193
208
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
@@ -195,7 +210,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
195
210
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
196
211
  evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
197
212
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
198
- evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
213
+ evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
199
214
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
200
215
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
201
216
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -205,8 +220,8 @@ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYF
205
220
  evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
206
221
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
207
222
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
208
- evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
209
- evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
223
+ evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
224
+ evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
210
225
  evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
211
226
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
212
227
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
@@ -232,8 +247,8 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
232
247
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
233
248
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
234
249
  evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
235
- evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
236
- evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
250
+ evalscope/report/app.py,sha256=cvof2Nm4ORxC4D3L22Kg3Ngu3kJwBZlfnFJkwDMCmSQ,26881
251
+ evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
237
252
  evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
238
253
  evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
239
254
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -253,6 +268,14 @@ evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odT
253
268
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
254
269
  evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
255
270
  evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
271
+ evalscope/third_party/thinkbench/__init__.py,sha256=C0aSu71_dc1upUVkKmq2VgDd9plpRcYUdCE6BjUWJcA,110
272
+ evalscope/third_party/thinkbench/eval.py,sha256=76G4LTkxqWCDCyj7Ahjj-qjO1gFem1uDzpRAC27ICl0,18896
273
+ evalscope/third_party/thinkbench/infer.py,sha256=2L4DAJKn3wAhNEKnKudQT60igGOJSKH80FR4nS7DHYk,3952
274
+ evalscope/third_party/thinkbench/resources/critique_template.txt,sha256=d4Egc-qH--4lG8X_EcmgymnuZgiCMbee1M5pt4HrRKA,535
275
+ evalscope/third_party/thinkbench/resources/reformat_template.txt,sha256=zTZyVAzmMBtAwI9lHly9EXsqX471OW-VTg538PDcB30,1775
276
+ evalscope/third_party/thinkbench/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
277
+ evalscope/third_party/thinkbench/tools/llm.py,sha256=HCFh58_THsVrFVzvGoThwWRu8EbPXD0DotLQEj5u4Tg,1353
278
+ evalscope/third_party/thinkbench/tools/utils.py,sha256=rDu2GVTK4ji9Yh9RLVksZqrfurQsSuN9GW3QCKJ60ng,401
256
279
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
257
280
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
258
281
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -262,20 +285,21 @@ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo
262
285
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
263
286
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
264
287
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
265
- evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
288
+ evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
266
289
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
267
290
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
268
- evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5J4,8676
291
+ evalscope/utils/chat_service.py,sha256=9LNTT-8KsacOLqnQer8j57e224rwOMbU7txV6re-X-A,8720
269
292
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
293
+ evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
270
294
  evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
271
- evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
272
- evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
273
- evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
295
+ evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
296
+ evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
297
+ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
274
298
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
275
299
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
276
300
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
277
- tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
278
- tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
301
+ tests/cli/test_collection.py,sha256=-CrcAiZVtsY7mXUNVlRjhFWEgmPL5k1dH9PjNhKzYdU,3028
302
+ tests/cli/test_run.py,sha256=flwZZ1PyMnrxy5f36mdUeGSO_ANpr2588dw1zHVQYJY,12735
279
303
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
280
304
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
281
305
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -288,9 +312,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
288
312
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
289
313
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
290
314
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
291
- evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
292
- evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
293
- evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
294
- evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
295
- evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
296
- evalscope-0.11.0.dist-info/RECORD,,
315
+ evalscope-0.12.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
316
+ evalscope-0.12.1.dist-info/METADATA,sha256=jdU1I5E3YNc8PLfY0NYYDTKiXzTE4HYtX5J6OUPkQ_s,31337
317
+ evalscope-0.12.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
318
+ evalscope-0.12.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
319
+ evalscope-0.12.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
320
+ evalscope-0.12.1.dist-info/RECORD,,
@@ -44,7 +44,7 @@ class TestCollection(unittest.TestCase):
44
44
  from evalscope import TaskConfig, run_task
45
45
 
46
46
  task_cfg = TaskConfig(
47
- model='Qwen2.5-7B-Instruct',
47
+ model='Qwen2.5-0.5B-Instruct',
48
48
  api_url='http://127.0.0.1:8801/v1/chat/completions',
49
49
  api_key='EMPTY',
50
50
  eval_type=EvalType.SERVICE,
tests/cli/test_run.py CHANGED
@@ -4,7 +4,8 @@ import subprocess
4
4
  import torch
5
5
  import unittest
6
6
 
7
- from evalscope.constants import EvalType
7
+ from evalscope.config import TaskConfig
8
+ from evalscope.constants import EvalType, OutputType
8
9
  from evalscope.run import run_task
9
10
  from evalscope.utils import is_module_installed, test_level_list
10
11
  from evalscope.utils.logger import get_logger
@@ -71,21 +72,104 @@ class TestRun(unittest.TestCase):
71
72
 
72
73
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
73
74
  def test_run_task(self):
74
- task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
75
- 'datasets': [
76
- # 'mmlu_pro',
77
- # 'bbh',
78
- # 'hellaswag',
79
- # 'gsm8k',
80
- # 'arc',
81
- # 'race',
82
- 'ifeval',
83
- # 'truthful_qa',
84
- # 'trivia_qa',
85
- ],
86
- 'limit': 2,
87
- 'eval_batch_size': 2,
88
- 'debug': True}
75
+ task_cfg = TaskConfig(
76
+ model='qwen/Qwen2.5-0.5B-Instruct',
77
+ datasets=[
78
+ 'iquiz',
79
+ # 'ifeval',
80
+ # 'mmlu',
81
+ # 'mmlu_pro',
82
+ # 'musr',
83
+ # 'process_bench',
84
+ # 'race',
85
+ # 'trivia_qa',
86
+ # 'cmmlu',
87
+ # 'humaneval',
88
+ # 'super_gpqa',
89
+ # 'gsm8k',
90
+ # 'bbh',
91
+ # 'competition_math',
92
+ # 'math_500',
93
+ 'aime24',
94
+ 'gpqa',
95
+ # 'arc',
96
+ # 'ceval',
97
+ # 'hellaswag',
98
+ # 'general_mcq',
99
+ # 'general_qa'
100
+ ],
101
+ dataset_args={
102
+ 'mmlu': {
103
+ 'subset_list': ['elementary_mathematics'],
104
+ 'few_shot_num': 0
105
+ },
106
+ 'mmlu_pro': {
107
+ 'subset_list': ['math', 'health'],
108
+ 'few_shot_num': 4
109
+ },
110
+ 'ceval': {
111
+ 'subset_list': [
112
+ 'computer_network', 'operating_system', 'computer_architecture'
113
+ ],
114
+ 'few_shot_num': 0
115
+ },
116
+ 'cmmlu': {
117
+ 'subset_list': ['elementary_chinese'],
118
+ 'few_shot_num': 0
119
+ },
120
+ 'bbh': {
121
+ 'subset_list': ['word_sorting', 'movie_recommendation'],
122
+ },
123
+ 'gpqa': {
124
+ 'subset_list': ['gpqa_diamond'],
125
+ 'few_shot_num': 0
126
+ },
127
+ 'humaneval': {
128
+ 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
129
+ },
130
+ 'competition_math': {
131
+ 'subset_list': ['Level 1']
132
+ },
133
+ 'process_bench': {
134
+ 'subset_list': ['gsm8k'],
135
+ },
136
+ 'musr': {
137
+ 'subset_list': ['murder_mysteries']
138
+ },
139
+ 'general_mcq': {
140
+ 'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
141
+ 'subset_list': [
142
+ 'example' # 评测数据集名称,上述 *_dev.csv 中的 *
143
+ ],
144
+ 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
145
+ },
146
+ 'general_qa': {
147
+ 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
148
+ 'subset_list': [
149
+ 'example', # 评测数据集名称,上述 *_dev.csv 中的 *
150
+ # 'test'
151
+ ],
152
+ 'metric_list': ['AverageBLEU']
153
+ },
154
+ 'super_gpqa': {
155
+ 'subset_list': ['Philosophy', 'Education'],
156
+ 'few_shot_num': 0
157
+ },
158
+ 'ifeval': {
159
+ 'filters': {
160
+ 'remove_until': '</think>'
161
+ }
162
+ }
163
+ },
164
+ limit=2,
165
+ eval_batch_size=2,
166
+ generation_config={
167
+ 'max_new_tokens': 2048,
168
+ 'temperature': 0.7,
169
+ 'num_return_sequences': 1,
170
+ },
171
+ # debug=True
172
+ )
89
173
  run_task(task_cfg=task_cfg)
90
174
 
91
175
 
@@ -101,7 +185,8 @@ class TestRun(unittest.TestCase):
101
185
  'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
102
186
  'subset_list': [
103
187
  'example' # 评测数据集名称,上述 *_dev.csv 中的 *
104
- ]
188
+ ],
189
+ 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
105
190
  },
106
191
  'general_qa': {
107
192
  'local_path': 'custom_eval/text/qa', # 自定义数据集路径
@@ -111,7 +196,8 @@ class TestRun(unittest.TestCase):
111
196
  }
112
197
  },
113
198
  )
114
- run_task(task_cfg=task_cfg)
199
+ res = run_task(task_cfg=task_cfg)
200
+ print(res)
115
201
 
116
202
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
117
203
  def test_run_humaneval(self):
@@ -140,7 +226,7 @@ class TestRun(unittest.TestCase):
140
226
 
141
227
  task_cfg = TaskConfig(
142
228
  model='Qwen2.5-0.5B-Instruct',
143
- api_url='http://127.0.0.1:8801/v1/chat/completions',
229
+ api_url='http://127.0.0.1:8801/v1',
144
230
  api_key='EMPTY',
145
231
  eval_type=EvalType.SERVICE,
146
232
  datasets=[
@@ -148,19 +234,24 @@ class TestRun(unittest.TestCase):
148
234
  # 'ifeval',
149
235
  # 'mmlu',
150
236
  # 'mmlu_pro',
237
+ # 'musr',
238
+ # 'process_bench',
151
239
  # 'race',
152
240
  # 'trivia_qa',
153
241
  # 'cmmlu',
154
242
  # 'humaneval',
155
243
  # 'gsm8k',
156
244
  # 'bbh',
157
- 'competition_math',
158
- 'math_500',
159
- 'aime24',
245
+ # 'competition_math',
246
+ # 'math_500',
247
+ # 'aime24',
160
248
  'gpqa',
161
249
  # 'arc',
162
- # 'ceval',
250
+ 'ceval',
163
251
  # 'hellaswag',
252
+ # 'general_mcq',
253
+ # 'general_qa'
254
+ # 'super_gpqa',
164
255
  ],
165
256
  dataset_args={
166
257
  'mmlu': {
@@ -168,8 +259,8 @@ class TestRun(unittest.TestCase):
168
259
  'few_shot_num': 0
169
260
  },
170
261
  'mmlu_pro': {
171
- 'subset_list': ['math'],
172
- 'few_shot_num': 0
262
+ 'subset_list': ['math', 'health'],
263
+ 'few_shot_num': 4
173
264
  },
174
265
  'ceval': {
175
266
  'subset_list': [
@@ -185,8 +276,9 @@ class TestRun(unittest.TestCase):
185
276
  'subset_list': ['word_sorting', 'movie_recommendation'],
186
277
  },
187
278
  'gpqa': {
188
- 'subset_list': ['gpqa_diamond'],
189
- 'few_shot_num': 0
279
+ # 'subset_list': ['gpqa_diamond'],
280
+ 'few_shot_num': 0,
281
+ 'local_path': './data/data/gpqa',
190
282
  },
191
283
  'humaneval': {
192
284
  'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
@@ -194,15 +286,42 @@ class TestRun(unittest.TestCase):
194
286
  'competition_math': {
195
287
  'subset_list': ['Level 1']
196
288
  },
289
+ 'process_bench': {
290
+ 'subset_list': ['gsm8k'],
291
+ },
292
+ 'musr': {
293
+ 'subset_list': ['murder_mysteries']
294
+ },
295
+ 'general_mcq': {
296
+ 'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
297
+ 'subset_list': [
298
+ 'example' # 评测数据集名称,上述 *_dev.csv 中的 *
299
+ ],
300
+ 'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # 问题模板
301
+ },
302
+ 'general_qa': {
303
+ 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
304
+ 'subset_list': [
305
+ 'example', # 评测数据集名称,上述 *_dev.csv 中的 *
306
+ # 'test'
307
+ ],
308
+ 'metric_list': ['AverageBLEU']
309
+ },
310
+ 'super_gpqa': {
311
+ # 'subset_list': ['Philosophy', 'Education'],
312
+ 'few_shot_num': 0
313
+ }
197
314
  },
198
- eval_batch_size=5,
315
+ eval_batch_size=32,
199
316
  limit=10,
200
- debug=True,
317
+ # debug=True,
318
+ stream=False,
201
319
  generation_config={
202
- 'temperature': 0.7,
203
- 'n': 5
320
+ 'temperature': 0,
321
+ 'n': 1,
322
+ 'max_tokens': 4096,
204
323
  },
205
- use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
324
+ # use_cache='./outputs/20250212_150525',
206
325
  )
207
326
 
208
327
  run_task(task_cfg=task_cfg)
File without changes